Compare commits
226 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 | |||
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN)
|
||||
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||
endif
|
||||
@@ -9,10 +12,11 @@ endif
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
go run ./cmd/bee $(RUN_ARGS)
|
||||
mkdir -p $(EXPORT_DIR)
|
||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -o bee ./cmd/bee
|
||||
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -21,30 +22,7 @@ var Version = "dev"
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
label = "dev"
|
||||
}
|
||||
if info, ok := debug.ReadBuildInfo(); ok {
|
||||
var revision string
|
||||
var modified bool
|
||||
for _, setting := range info.Settings {
|
||||
switch setting.Key {
|
||||
case "vcs.revision":
|
||||
revision = setting.Value
|
||||
case "vcs.modified":
|
||||
modified = setting.Value == "true"
|
||||
}
|
||||
}
|
||||
if revision != "" {
|
||||
short := revision
|
||||
if len(short) > 12 {
|
||||
short = short[:12]
|
||||
}
|
||||
label += " (" + short
|
||||
if modified {
|
||||
label += "+"
|
||||
}
|
||||
label += ")"
|
||||
}
|
||||
return "dev"
|
||||
}
|
||||
return label
|
||||
}
|
||||
@@ -53,10 +31,19 @@ func main() {
|
||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||
}
|
||||
|
||||
func run(args []string, stdout, stderr io.Writer) int {
|
||||
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
slog.Error("fatal panic",
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
exitCode = 1
|
||||
}
|
||||
}()
|
||||
|
||||
if len(args) == 0 {
|
||||
printRootUsage(stderr)
|
||||
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -98,8 +87,9 @@ func printRootUsage(w io.Writer) {
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -304,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
||||
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||
fs.Usage = func() {
|
||||
@@ -390,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
slog.Info("sat archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 0
|
||||
}
|
||||
target := args[0]
|
||||
if target != "nvidia" {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
|
||||
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||
return 2
|
||||
}
|
||||
|
||||
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||
Profile: *profile,
|
||||
SizeMB: *sizeMB,
|
||||
GPUIndices: includeIndices,
|
||||
ExcludeGPUIndices: excludeIndices,
|
||||
RunNCCL: !*skipNCCL,
|
||||
}, logLine)
|
||||
if err != nil {
|
||||
slog.Error("run benchmark", "target", target, "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return nil, nil
|
||||
}
|
||||
var indices []int
|
||||
for _, part := range strings.Split(raw, ",") {
|
||||
part = strings.TrimSpace(part)
|
||||
if part == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(part)
|
||||
if err != nil || value < 0 {
|
||||
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||
}
|
||||
indices = append(indices, value)
|
||||
}
|
||||
return indices, nil
|
||||
}
|
||||
|
||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
old := Version
|
||||
Version = "test-version"
|
||||
t.Cleanup(func() { Version = old })
|
||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||
old := Version
|
||||
Version = "1.2.3"
|
||||
t.Cleanup(func() { Version = old })
|
||||
|
||||
if got := buildLabel(); got != "1.2.3" {
|
||||
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunExportRequiresTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -5,22 +5,18 @@ go 1.25.0
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
modernc.org/sqlite v1.48.0
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/libc v1.72.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
@@ -19,17 +19,22 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
|
||||
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
@@ -40,6 +45,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -80,6 +87,8 @@ type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -100,6 +109,14 @@ func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||
return a.installer.LiveMediaRAMState()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -107,9 +124,19 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
@@ -122,7 +149,7 @@ type satRunner interface {
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -131,7 +158,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -140,19 +167,32 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
@@ -160,7 +200,8 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -175,10 +216,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
@@ -203,10 +241,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
@@ -272,10 +307,13 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -497,6 +535,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -509,10 +556,106 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -521,14 +664,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -553,14 +696,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -647,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
@@ -733,6 +883,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -767,6 +918,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
@@ -843,6 +995,41 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||
// component-status DB so they are visible in the Hardware Summary card.
|
||||
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||
if db == nil || len(psus) == 0 {
|
||||
return
|
||||
}
|
||||
const source = "audit:ipmi"
|
||||
worstStatus := "OK"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
slot := "?"
|
||||
if psu.Slot != nil {
|
||||
slot = *psu.Slot
|
||||
}
|
||||
st := *psu.Status
|
||||
detail := ""
|
||||
if psu.ErrorDescription != nil {
|
||||
detail = *psu.ErrorDescription
|
||||
}
|
||||
db.Record("psu:"+slot, source, st, detail)
|
||||
switch st {
|
||||
case "Critical":
|
||||
worstStatus = "Critical"
|
||||
case "Warning":
|
||||
if worstStatus != "Critical" {
|
||||
worstStatus = "Warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
db.Record("psu:all", source, worstStatus, "")
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -861,6 +1048,12 @@ func latestSATSummaries() []string {
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -120,15 +121,26 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||
resetNvidiaGPUFn func(int) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
@@ -139,6 +151,62 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBenchmarkFn != nil {
|
||||
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||
if f.runNvidiaAutotuneFn != nil {
|
||||
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerFn != nil {
|
||||
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPulseFn != nil {
|
||||
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBandwidthFn != nil {
|
||||
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
@@ -153,11 +221,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
if f.listNvidiaGPUStatusesFn != nil {
|
||||
return f.listNvidiaGPUStatusesFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
if f.resetNvidiaGPUFn != nil {
|
||||
return f.resetNvidiaGPUFn(index)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
@@ -215,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -478,8 +593,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -516,8 +629,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -579,8 +690,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
@@ -660,18 +769,61 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -698,6 +850,8 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
var manifest string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -707,6 +861,43 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read manifest entry: %v", err)
|
||||
}
|
||||
manifest = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/kernel-aer-nvidia.txt",
|
||||
"/system/lspci-nvidia-bridges-vv.txt",
|
||||
"/system/pcie-aer-sysfs.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
"/system/mstflint-query.txt",
|
||||
} {
|
||||
var found bool
|
||||
for _, name := range names {
|
||||
if contains(name, want) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
@@ -721,6 +912,18 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(manifest, "files:") {
|
||||
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||
}
|
||||
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -734,6 +937,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -749,12 +956,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
|
||||
67
audit/internal/app/atomic_write.go
Normal file
67
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// readFileLimited reads path into memory, refusing files larger than maxBytes.
|
||||
// Prevents OOM on corrupted or unexpectedly large data files.
|
||||
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if int64(len(data)) > maxBytes {
|
||||
return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
}
|
||||
|
||||
tmpPath := path + ".tmp"
|
||||
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||
}
|
||||
|
||||
success := false
|
||||
defer func() {
|
||||
_ = f.Close()
|
||||
if !success {
|
||||
_ = os.Remove(tmpPath)
|
||||
}
|
||||
}()
|
||||
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||
}
|
||||
|
||||
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||
_ = dir.Sync()
|
||||
_ = dir.Close()
|
||||
}
|
||||
|
||||
success = true
|
||||
return nil
|
||||
}
|
||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||
t.Fatalf("seed file: %v", err)
|
||||
}
|
||||
|
||||
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||
t.Fatalf("atomicWriteFile: %v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read final: %v", err)
|
||||
}
|
||||
if string(raw) != "new\n" {
|
||||
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||
a := &App{
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: exportDir,
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got, err := a.RunRuntimePreflight("file:" + path)
|
||||
if err != nil {
|
||||
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||
}
|
||||
if got != path {
|
||||
t.Fatalf("path=%q want %q", got, path)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read runtime file: %v", err)
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
t.Fatalf("json unmarshal: %v", err)
|
||||
}
|
||||
if health.Status != "OK" {
|
||||
t.Fatalf("status=%q want OK", health.Status)
|
||||
}
|
||||
}
|
||||
268
audit/internal/app/component_status_db.go
Normal file
268
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is a single observation appended to a component's
// history.
type ComponentStatusEntry struct {
	// At is when the observation was recorded.
	At time.Time `json:"at"`
	// Status is the status reported by this observation.
	Status string `json:"status"`
	// Source is a short label for the reporter,
	// e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg".
	Source string `json:"source"`
	// Detail is optional free-form text; omitted from JSON when empty.
	Detail string `json:"detail,omitempty"`
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := readFileLimited(path, 10<<20)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity ranks a status string so that a larger number means a
// more severe status: "Critical" is 3, "Warning" is 2, "OK" is 1, and anything
// else (including "Unknown" and the empty string) is 0. Surrounding
// whitespace is ignored; matching is case-sensitive.
func componentSeverity(status string) int {
	name := strings.TrimSpace(status)
	if name == "Critical" {
		return 3
	}
	if name == "Warning" {
		return 2
	}
	if name == "OK" {
		return 1
	}
	return 0
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// satStatusToDBStatus maps an upper-case SAT overall status to a component
// DB status: "OK" stays "OK", "FAILED" becomes "Warning", and everything else
// (including "PARTIAL" and "UNSUPPORTED") becomes "Unknown".
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	return "Unknown"
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
// extractArchivePath trims s and, when the result ends in ".tar.gz", returns
// its last whitespace-separated field (so "Archive written to /p/a.tar.gz"
// yields "/p/a.tar.gz"). Any other input is returned trimmed but otherwise
// unchanged.
func extractArchivePath(s string) string {
	trimmed := strings.TrimSpace(s)
	if !strings.HasSuffix(trimmed, ".tar.gz") {
		return trimmed
	}
	fields := strings.Fields(trimmed)
	if len(fields) == 0 {
		return trimmed
	}
	return fields[len(fields)-1]
}
|
||||
|
||||
// parseSATKV parses newline-separated "key=value" text into a map. Each line
// is trimmed and split at its first '='; key and value are trimmed again.
// Lines without '=' are skipped, and a later duplicate key overwrites an
// earlier one.
func parseSATKV(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, rawLine := range strings.Split(raw, "\n") {
		line := strings.TrimSpace(rawLine)
		eq := strings.IndexByte(line, '=')
		if eq < 0 {
			continue
		}
		pairs[strings.TrimSpace(line[:eq])] = strings.TrimSpace(line[eq+1:])
	}
	return pairs
}
|
||||
@@ -3,13 +3,14 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
@@ -28,6 +30,102 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
// nvidiaPerGPUStatus is the result parsed from one per-GPU status file of an
// NVIDIA SAT run.
type nvidiaPerGPUStatus struct {
	// runStatus is the upper-cased run_status value from the file.
	runStatus string
	// reason is the trimmed reason value from the file; may be empty.
	reason string
}
|
||||
|
||||
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if devs[i].Telemetry == nil {
|
||||
continue
|
||||
}
|
||||
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
idx, ok := telemetryInt(rawIdx)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
st, ok := statusByIndex[idx]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
runDir := matches[len(matches)-1]
|
||||
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return nil, "", false
|
||||
}
|
||||
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||
for _, file := range files {
|
||||
raw, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[idx] = nvidiaPerGPUStatus{
|
||||
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||
reason: strings.TrimSpace(kv["reason"]),
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
return out, runAtUTC, true
|
||||
}
|
||||
|
||||
// telemetryInt converts a loosely-typed telemetry value to an int.
//
// int, int32, and int64 are converted directly; float64 is truncated toward
// zero; a string is trimmed and parsed as a decimal integer. Any other
// dynamic type, or an unparsable string, yields (0, false).
func telemetryInt(v any) (int, bool) {
	if n, ok := v.(int); ok {
		return n, true
	}
	if n, ok := v.(int32); ok {
		return int(n), true
	}
	if n, ok := v.(int64); ok {
		return int(n), true
	}
	if f, ok := v.(float64); ok {
		return int(f), true
	}
	text, ok := v.(string)
	if !ok {
		return 0, false
	}
	parsed, err := strconv.Atoi(strings.TrimSpace(text))
	if err != nil {
		return 0, false
	}
	return parsed, true
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -174,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
newSeverity := statusSeverity(satStatus)
|
||||
currentSeverity := statusSeverity(current)
|
||||
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||
return
|
||||
}
|
||||
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
@@ -206,6 +329,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus validates a ComponentStatusDB status string and returns
// it in the canonical form expected by mergeComponentStatus: exactly one of
// "OK", "Warning", "Critical", or "Unknown".
//
// Surrounding whitespace is ignored for the comparison and stripped from the
// result, so " Warning " yields "Warning". Any other value yields "".
func dbStatusToSATStatus(s string) string {
	switch trimmed := strings.TrimSpace(s); trimmed {
	case "OK", "Warning", "Critical", "Unknown":
		// Return the trimmed form, not the raw input, so callers never
		// store a status with stray whitespace.
		return trimmed
	default:
		return ""
	}
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address for comparison: the
// input is trimmed and lower-cased, and an address with exactly one ':' gets
// the "0000:" domain prefixed ("c8:00.0" -> "0000:c8:00.0"). Other non-empty
// values are returned as normalised. An empty value, the literal "gpu", or a
// value containing a space or tab yields "".
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
		return ""
	case strings.Count(addr, ":") == 1:
		return "0000:" + addr
	}
	return addr
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,9 +53,57 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "VideoController"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
bdf0 := "0000:4b:00.0"
|
||||
bdf1 := "0000:4f:00.0"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf0,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||
},
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf1,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||
}
|
||||
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||
got := "<nil>"
|
||||
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||
got = *snap.PCIeDevices[1].ErrorDescription
|
||||
}
|
||||
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bee/audit/internal/platform"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -19,7 +20,11 @@ var supportBundleServices = []string{
|
||||
"bee-network.service",
|
||||
"bee-nvidia.service",
|
||||
"bee-preflight.service",
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
@@ -27,15 +32,217 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
// Lists whichever of the known Fabric Manager / NVLSM binaries exist.
// The "none found" message is driven by a flag set inside the loop: a single
// `ls` over all four candidates exits non-zero as soon as any one of them is
// missing, which would print the message even after binaries were listed.
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
found=0
for candidate in \
  /usr/bin/nvidia-fabricmanager \
  /usr/bin/nv-fabricmanager \
  /usr/bin/nvidia-fabricmanagerd \
  /usr/bin/nvlsm; do
  if [ -e "$candidate" ]; then
    found=1
    echo "=== $candidate ==="
    ls -l "$candidate" 2>&1 || true
    echo
  fi
done
if [ "$found" -eq 0 ]; then
  echo "no fabric manager binaries found"
fi
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||
echo
|
||||
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||
lspci -s "$bridge" -vv 2>&1 || true
|
||||
fi
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no NVIDIA PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||
found=0
|
||||
for dev in /sys/bus/pci/devices/*; do
|
||||
[ -e "$dev" ] || continue
|
||||
bdf=$(basename "$dev")
|
||||
block=""
|
||||
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||
if [ -r "$dev/$f" ]; then
|
||||
if [ -z "$block" ]; then
|
||||
block=1
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
fi
|
||||
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||
fi
|
||||
done
|
||||
if [ -n "$block" ]; then
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no PCIe AER sysfs counters found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -i "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -m "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v mstflint >/dev/null 2>&1; then
|
||||
echo "mstflint not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/bus/pci/devices/*; do
|
||||
[ -e "$path/vendor" ] || continue
|
||||
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x15b3" ] || continue
|
||||
bdf=$(basename "$path")
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
mstflint -d "$bdf" q 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no Mellanox/NVIDIA networking devices found"
|
||||
fi
|
||||
`}},
|
||||
}
|
||||
|
||||
// supportBundleOptionalFiles lists log files that are copied into the support
// bundle on a best-effort basis: name is the path inside the bundle, src the
// path on the host.
var supportBundleOptionalFiles = []struct {
	name, src string
}{
	{"system/kern.log", "/var/log/kern.log"},
	{"system/syslog.txt", "/var/log/syslog"},
	{"system/fabricmanager.log", "/var/log/fabricmanager.log"},
	{"system/nvlsm.log", "/var/log/nvlsm.log"},
	{"system/fabricmanager/fabricmanager.log", "/var/log/fabricmanager/fabricmanager.log"},
	{"system/fabricmanager/nvlsm.log", "/var/log/fabricmanager/nvlsm.log"},
}

// supportBundleGlob is the file-name pattern used to find support bundle
// archives: a date-shaped prefix, then "(BEE-SP...)", then anything ending in
// ".tar.gz".
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
@@ -48,9 +255,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
host := sanitizeFilename(hostnameOr("unknown"))
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -75,45 +287,79 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
type entry struct {
|
||||
path string
|
||||
mod time.Time
|
||||
entries := supportBundleEntries(matches)
|
||||
for path, mod := range entries {
|
||||
if time.Since(mod) > 24*time.Hour {
|
||||
_ = os.Remove(path)
|
||||
delete(entries, path)
|
||||
}
|
||||
}
|
||||
list := make([]entry, 0, len(matches))
|
||||
ordered := orderSupportBundles(entries)
|
||||
if len(ordered) > 3 {
|
||||
for _, old := range ordered[3:] {
|
||||
_ = os.Remove(old)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func latestSupportBundlePath(dir string) (string, error) {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||
if len(ordered) == 0 {
|
||||
return "", os.ErrNotExist
|
||||
}
|
||||
return ordered[0], nil
|
||||
}
|
||||
|
||||
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||
entries := make(map[string]time.Time, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||
_ = os.Remove(match)
|
||||
continue
|
||||
}
|
||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||
entries[match] = info.ModTime()
|
||||
}
|
||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||
if len(list) > 3 {
|
||||
for _, old := range list[3:] {
|
||||
_ = os.Remove(old.path)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||
ordered := make([]string, 0, len(entries))
|
||||
for path := range entries {
|
||||
ordered = append(ordered, path)
|
||||
}
|
||||
return nil
|
||||
sort.Slice(ordered, func(i, j int) bool {
|
||||
return entries[ordered[i]].After(entries[ordered[j]])
|
||||
})
|
||||
return ordered
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
@@ -152,6 +398,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
@@ -161,6 +425,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||
if strings.TrimSpace(cfg.Reason) != "" {
|
||||
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
@@ -188,6 +459,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func bundleVersion() string {
|
||||
v := buildVersion()
|
||||
v = strings.TrimPrefix(v, "v")
|
||||
v = strings.TrimPrefix(v, "V")
|
||||
if v == "" || v == "unknown" {
|
||||
return "0.0"
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func serverModelForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Product Name" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return strings.ReplaceAll(val, " ", "_")
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func serverSerialForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Serial Number" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return val
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
@@ -215,7 +540,7 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
@@ -227,7 +552,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||
}
|
||||
|
||||
func normalizeSupportBundleAuditJSON(path string) error {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
normalized, err := ApplySATOverlay(data)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return os.WriteFile(path, normalized, 0644)
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
package collector
|
||||
|
||||
import "bee/audit/internal/schema"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
|
||||
@@ -2,18 +2,21 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
mstflintQuery = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -21,7 +24,7 @@ var (
|
||||
}
|
||||
|
||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -29,6 +32,14 @@ var (
|
||||
}
|
||||
|
||||
netIfacesByBDF = listNetIfacesByBDF
|
||||
readNetCarrierFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||
@@ -162,3 +173,9 @@ func listNetIfacesByBDF(bdf string) []string {
|
||||
}
|
||||
return ifaces
|
||||
}
|
||||
|
||||
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
var (
|
||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -113,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
}
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case key == "identifier":
|
||||
s := parseSFPIdentifier(val)
|
||||
dev.SFPIdentifier = &s
|
||||
t := true
|
||||
dev.SFPPresent = &t
|
||||
changed = true
|
||||
case key == "connector":
|
||||
s := parseSFPConnector(val)
|
||||
dev.SFPConnector = &s
|
||||
changed = true
|
||||
case key == "vendor name":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPVendor = &s
|
||||
changed = true
|
||||
case key == "vendor pn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPPartNumber = &s
|
||||
changed = true
|
||||
case key == "vendor sn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPSerialNumber = &s
|
||||
changed = true
|
||||
case strings.Contains(key, "laser wavelength"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPWavelengthNM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPTemperatureC = &f
|
||||
@@ -145,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
return changed
|
||||
}
|
||||
|
||||
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||
func parseSFPIdentifier(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||
// e.g. "0x07 (LC)" → "LC".
|
||||
func parseSFPConnector(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parenRe matches the first "(...)" group that holds at least one character
// other than ")".
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)

// extractParens returns the text inside the first match of parenRe in s, with
// surrounding whitespace trimmed. It returns "" when s has no such group.
func extractParens(s string) string {
	if groups := parenRe.FindStringSubmatch(s); len(groups) >= 2 {
		return strings.TrimSpace(groups[1])
	}
	return ""
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPPresent != nil {
|
||||
out["sfp_present"] = *dev.SFPPresent
|
||||
}
|
||||
if dev.SFPIdentifier != nil {
|
||||
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||
}
|
||||
if dev.SFPConnector != nil {
|
||||
out["sfp_connector"] = *dev.SFPConnector
|
||||
}
|
||||
if dev.SFPVendor != nil {
|
||||
out["sfp_vendor"] = *dev.SFPVendor
|
||||
}
|
||||
if dev.SFPPartNumber != nil {
|
||||
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||
}
|
||||
if dev.SFPSerialNumber != nil {
|
||||
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||
}
|
||||
if dev.SFPWavelengthNM != nil {
|
||||
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||
}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
@@ -101,6 +104,39 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -13,14 +13,20 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
Index int
|
||||
BDF string
|
||||
Name string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -68,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Name); v != "" {
|
||||
devs[i].Model = &v
|
||||
}
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
}
|
||||
@@ -94,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 14 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Name: strings.TrimSpace(rec[2]),
|
||||
Serial: strings.TrimSpace(rec[3]),
|
||||
VBIOS: strings.TrimSpace(rec[4]),
|
||||
TemperatureC: parseMaybeFloat(rec[5]),
|
||||
PowerW: parseMaybeFloat(rec[6]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||
HWSlowdown: parseMaybeBool(rec[9]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +182,30 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
// parseMaybeInt parses v as a base-10 int after trimming surrounding
// whitespace. It returns nil for an empty value, for the placeholders "n/a",
// "not supported" and "[not supported]" in any letter case, and for any other
// text that strconv.Atoi rejects.
func parseMaybeInt(v string) *int {
	trimmed := strings.TrimSpace(v)
	for _, placeholder := range []string{"", "n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(trimmed, placeholder) {
			return nil
		}
	}

	parsed, err := strconv.Atoi(trimmed)
	if err != nil {
		return nil
	}
	return &parsed
}
|
||||
|
||||
// parseRequiredInt parses v as a base-10 int after trimming surrounding
// whitespace and returns 0 whenever strconv.Atoi reports an error.
func parseRequiredInt(v string) int {
	if n, err := strconv.Atoi(strings.TrimSpace(v)); err == nil {
		return n
	}
	return 0
}
|
||||
|
||||
// pcieLinkGenLabel renders a PCIe generation number as the text "Gen"
// followed by the number in decimal, e.g. 4 becomes "Gen4".
func pcieLinkGenLabel(gen int) string {
	label := fmt.Sprintf("Gen%d", gen)
	return label
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -216,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||
if info.TemperatureC != nil {
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
@@ -231,4 +274,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("gpu by normalized bdf not found")
|
||||
}
|
||||
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||
t.Fatalf("name: got %q", gpu.Name)
|
||||
}
|
||||
if gpu.Serial != "GPU-SERIAL-1" {
|
||||
t.Fatalf("serial: got %q", gpu.Serial)
|
||||
}
|
||||
@@ -28,6 +31,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
@@ -80,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||
}
|
||||
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||
t.Fatalf("status: got %v", out[0].Status)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
@@ -59,6 +60,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
@@ -78,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||
bmcPatterns := []string{
|
||||
"management system chip",
|
||||
"management controller",
|
||||
"ibmc",
|
||||
"idrac",
|
||||
"ilo vga",
|
||||
"aspeed",
|
||||
"matrox",
|
||||
}
|
||||
for _, bad := range bmcPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
@@ -152,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
@@ -221,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
// pcieLinkSpeedRank maps the exact labels "Gen1" through "Gen6" to the
// numbers 1 through 6 (e.g. "Gen4" → 4). Every other input, including
// different letter case or extra characters, yields 0.
func pcieLinkSpeedRank(gen string) int {
	if len(gen) != 4 || gen[:3] != "Gen" {
		return 0
	}
	if digit := gen[3]; digit >= '1' && digit <= '6' {
		return int(digit - '0')
	}
	return 0
}
|
||||
|
||||
func normalizePCILinkSpeed(raw string) string {
|
||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||
switch {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -19,6 +20,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
@@ -28,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
@@ -76,6 +80,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
@@ -124,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||
ptr := func(s string) *string { return &s }
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
linkSpeed *string
|
||||
maxSpeed *string
|
||||
wantWarning bool
|
||||
wantGenIn string // substring expected in ErrorDescription when warning
|
||||
}{
|
||||
{
|
||||
name: "degraded Gen1 vs Gen5",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen1",
|
||||
},
|
||||
{
|
||||
name: "at max Gen5",
|
||||
linkSpeed: ptr("Gen5"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "degraded Gen4 vs Gen5",
|
||||
linkSpeed: ptr("Gen4"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen4",
|
||||
},
|
||||
{
|
||||
name: "missing current speed — no warning",
|
||||
linkSpeed: nil,
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "missing max speed — no warning",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: nil,
|
||||
wantWarning: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
ok := statusOK
|
||||
dev.Status = &ok
|
||||
dev.LinkSpeed = tt.linkSpeed
|
||||
dev.MaxLinkSpeed = tt.maxSpeed
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||
if gotWarn != tt.wantWarning {
|
||||
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||
}
|
||||
if tt.wantWarning {
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("expected ErrorDescription to be set")
|
||||
}
|
||||
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||
}
|
||||
} else {
|
||||
if dev.ErrorDescription != nil {
|
||||
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,11 +160,57 @@ type psuSDR struct {
|
||||
}
|
||||
|
||||
// psuSlotPatterns lists the regular expressions used to pull a PSU slot number
// (capture group 1) out of a sensor name. Every pattern is case-insensitive.
// Each pattern appears exactly once; the order keeps the generic "PowerN"
// fallback last.
var psuSlotPatterns = []*regexp.Regexp{
	// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
	// does not fire after the digit; match explicitly with underscore terminator.
	regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
	regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),                       // PSU1, PS1, ps 2
	regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),                         // PS 6, PS6
	regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),                        // PWS1
	regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
	regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),                        // Bay 1
	// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
	// Must be last: "power supply N" is already caught by the pattern above.
	regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
}
|
||||
|
||||
// isPSUInputPower reports whether name looks like an AC-input power sensor.
// Matching is by plain substring against lower-case keywords, so name is
// expected to be lower-cased by the caller already. Vendor examples:
//
//	MSI:     PSU1_POWER_IN, PSU1_PIN
//	MLT:     PSU1_PIN
//	xFusion: (matched via default fallback — no explicit keyword)
//	HPE:     PS1 Input Power, PS1 Input Watts
func isPSUInputPower(name string) bool {
	// "power_in" also covers the "_power_in" spelling, so one entry suffices.
	keywords := []string{
		"input power",
		"input watts",
		"_pin",
		" pin",
		"power_in",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// isPSUOutputPower reports whether name looks like a DC-output power sensor.
// Matching is by plain substring against lower-case keywords, so name is
// expected to be lower-cased by the caller already. Vendor examples:
//
//	MSI:     PSU1_POWER_OUT
//	MLT:     PSU1_POUT
//	xFusion: PS1 POut
//
// Sensors named "power supply bay" / "psu bay" are also classified as output
// power.
func isPSUOutputPower(name string) bool {
	// "power_out" also covers the "_power_out" spelling, so one entry suffices.
	keywords := []string{
		"output power",
		"output watts",
		"_pout",
		" pout",
		"power_out",
		"power supply bay",
		"psu bay",
	}
	for _, kw := range keywords {
		if strings.Contains(name, kw) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||
v := parseFloatPtr(raw)
|
||||
if v == nil || *v <= 0 || *v > max {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(lowerName, "input power"):
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case isPSUInputPower(lowerName):
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
case isPSUOutputPower(lowerName):
|
||||
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
default:
|
||||
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||
// AC input if the value looks like wattage and no better data is set yet.
|
||||
if entry.inputPowerW == nil {
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
}
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// PSUSlotPower carries the SDR-derived readings for a single PSU slot.
// PSUSlotsFromSDR keys these values by the 0-based slot index rendered as a
// string, matching HardwarePowerSupply.Slot in the audit schema.
type PSUSlotPower struct {
	// InputW is the input power reading; omitted from JSON when nil.
	InputW *float64 `json:"input_w,omitempty"`
	// OutputW is the output power reading; omitted from JSON when nil.
	OutputW *float64 `json:"output_w,omitempty"`
	// Status is the sensor status text; omitted from JSON when empty.
	Status string `json:"status,omitempty"`
}
|
||||
|
||||
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||
sdr := parsePSUSDR(sdrOutput)
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]PSUSlotPower, len(sdr))
|
||||
for slot, entry := range sdr {
|
||||
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||
out[key] = PSUSlotPower{
|
||||
InputW: entry.inputPowerW,
|
||||
OutputW: entry.outputPowerW,
|
||||
Status: entry.status,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
|
||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
// MSI underscore format — \b does not fire between digit and '_'
|
||||
{name: "PSU1_POWER_IN", want: 1},
|
||||
{name: "PSU2_POWER_OUT", want: 2},
|
||||
{name: "PSU4_STATUS", want: 4},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||
t.Parallel()
|
||||
raw := `
|
||||
PSU1_STATUS | F1h | ok
|
||||
PSU1_POWER_OUT | 928 Watts | ok
|
||||
PSU1_POWER_IN | 976 Watts | ok
|
||||
PSU2_STATUS | F2h | ok
|
||||
PSU2_POWER_OUT | 944 Watts | ok
|
||||
PSU2_POWER_IN | 992 Watts | ok
|
||||
`
|
||||
got := parsePSUSDR(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d want 2", len(got))
|
||||
}
|
||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||
}
|
||||
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||
}
|
||||
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
// isVirtualHDiskModel reports whether model, after trimming surrounding
// whitespace and ignoring case, begins with "virtual hdisk".
func isVirtualHDiskModel(model string) bool {
	const prefix = "virtual hdisk"
	normalized := strings.TrimSpace(model)
	normalized = strings.ToLower(normalized)
	return strings.HasPrefix(normalized, prefix)
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
|
||||
4636
audit/internal/platform/benchmark.go
Normal file
4636
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Power-source autotune settings.
const (
	// benchmarkPowerAutotuneVersion is written into saved configs whose
	// Version field is not positive.
	benchmarkPowerAutotuneVersion = 1

	// Idle and load window lengths, in seconds.
	// NOTE(review): not referenced in this part of the file — presumably used
	// by the autotune driver; confirm.
	benchmarkPowerAutotuneIdleSec = 60
	benchmarkPowerAutotuneLoadSec = 90

	// benchmarkPowerAutotuneSampleInterval is the pause between telemetry
	// samples, in seconds. Kept untyped so it can be multiplied by time.Second.
	benchmarkPowerAutotuneSampleInterval = 3

	// defaultBenchmarkPowerSourceConfigPath is used when no base directory is
	// supplied to BenchmarkPowerSourceConfigPath.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
|
||||
|
||||
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||
baseDir = strings.TrimSpace(baseDir)
|
||||
if baseDir == "" {
|
||||
return defaultBenchmarkPowerSourceConfigPath
|
||||
}
|
||||
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||
}
|
||||
|
||||
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var cfg BenchmarkPowerAutotuneConfig
|
||||
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||
}
|
||||
return &cfg, nil
|
||||
}
|
||||
|
||||
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return fmt.Errorf("empty autotune config path")
|
||||
}
|
||||
if cfg.Version <= 0 {
|
||||
cfg.Version = benchmarkPowerAutotuneVersion
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(cfg, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||
}
|
||||
|
||||
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config file at path.
// A file that does not exist is treated as already reset (nil error). A blank
// path, or any other removal failure, is reported as an error.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	removeErr := os.Remove(path)
	if removeErr == nil || os.IsNotExist(removeErr) {
		return nil
	}
	return removeErr
}
|
||||
|
||||
func normalizeBenchmarkPowerSource(source string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
return BenchmarkPowerSourceSDRPSUInput
|
||||
default:
|
||||
return BenchmarkPowerSourceDCMI
|
||||
}
|
||||
}
|
||||
|
||||
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: true,
|
||||
SelectedSource: selected,
|
||||
EffectiveSource: selected,
|
||||
Mode: "autotuned",
|
||||
Reason: strings.TrimSpace(cfg.Reason),
|
||||
ConfiguredAt: cfg.UpdatedAt,
|
||||
}
|
||||
}
|
||||
|
||||
sources := sampleBenchmarkPowerSources()
|
||||
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||
}
|
||||
}
|
||||
return SystemPowerSourceDecision{
|
||||
Configured: false,
|
||||
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||
Mode: "fallback",
|
||||
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||
}
|
||||
}
|
||||
|
||||
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||
decision := ResolveSystemPowerDecision(exportDir)
|
||||
if decision.EffectiveSource != "" {
|
||||
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||
return value, decision, nil
|
||||
} else if decision.Configured {
|
||||
fallback := BenchmarkPowerSourceDCMI
|
||||
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||
decision.EffectiveSource = fallback
|
||||
return fallbackValue, decision, nil
|
||||
}
|
||||
decision.Mode = "degraded"
|
||||
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||
return 0, decision, err
|
||||
}
|
||||
}
|
||||
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||
}
|
||||
|
||||
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||
switch normalizeBenchmarkPowerSource(source) {
|
||||
case BenchmarkPowerSourceSDRPSUInput:
|
||||
sdr := sampleIPMISDRPowerSensors()
|
||||
if sdr.PSUInW > 0 {
|
||||
return sdr.PSUInW, nil
|
||||
}
|
||||
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||
default:
|
||||
return queryIPMIServerPowerW()
|
||||
}
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceDCMI] = w
|
||||
}
|
||||
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||
if durationSec <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||
}
|
||||
close(stopCh)
|
||||
return <-doneCh
|
||||
}
|
||||
|
||||
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||
if intervalSec <= 0 {
|
||||
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||
}
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
var samples []float64
|
||||
record := func() {
|
||||
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
}
|
||||
record()
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
record()
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// benchmarkPowerAutotuneSample is one telemetry row captured during an
// autotune phase.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time since the phase's sampling started, in seconds.
	ElapsedSec float64
	// GPUAvgUsagePct is the mean usage across the sampled GPUs.
	GPUAvgUsagePct float64
	// CPUUsagePct is the value reported by sampleCPULoadPct.
	CPUUsagePct float64
	// GPUSumPowerW is the summed power draw of the sampled GPUs.
	GPUSumPowerW float64
	// Sources holds the server power readings keyed by power source name.
	Sources map[string]float64
}
|
||||
|
||||
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||
if durationSec <= 0 {
|
||||
return nil
|
||||
}
|
||||
var out []benchmarkPowerAutotuneSample
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
start := time.Now()
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return out
|
||||
}
|
||||
row := benchmarkPowerAutotuneSample{
|
||||
ElapsedSec: time.Since(start).Seconds(),
|
||||
CPUUsagePct: sampleCPULoadPct(),
|
||||
Sources: sampleBenchmarkPowerSources(),
|
||||
}
|
||||
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||
var usageSum float64
|
||||
for _, gpu := range gpuRows {
|
||||
row.GPUSumPowerW += gpu.PowerW
|
||||
usageSum += gpu.UsagePct
|
||||
}
|
||||
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||
}
|
||||
out = append(out, row)
|
||||
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||
if time.Now().After(deadline) {
|
||||
return out
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return out
|
||||
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||
} else {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||
}
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||
phase,
|
||||
sample.ElapsedSec,
|
||||
sample.GPUAvgUsagePct,
|
||||
sample.GPUSumPowerW,
|
||||
sample.CPUUsagePct,
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||
if logFunc == nil || len(samples) == 0 {
|
||||
return
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
var gpuPower []float64
|
||||
sourceBuckets := map[string][]float64{}
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
var sourceParts []string
|
||||
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||
values := sourceBuckets[source]
|
||||
if len(values) == 0 {
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||
continue
|
||||
}
|
||||
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||
phase,
|
||||
len(samples),
|
||||
benchmarkMean(gpuUsage),
|
||||
benchmarkPercentile(gpuUsage, 95),
|
||||
benchmarkMean(gpuPower),
|
||||
benchmarkMean(cpuUsage),
|
||||
benchmarkPercentile(cpuUsage, 95),
|
||||
strings.Join(sourceParts, " "),
|
||||
))
|
||||
}
|
||||
|
||||
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||
if logFunc == nil {
|
||||
return
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if !candidate.Available {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||
continue
|
||||
}
|
||||
logFunc(fmt.Sprintf(
|
||||
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||
candidate.Source,
|
||||
candidate.IdleAvgW,
|
||||
candidate.LoadAvgW,
|
||||
candidate.DeltaW,
|
||||
gpuDelta,
|
||||
candidate.RelativeError,
|
||||
candidate.Confidence*100,
|
||||
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||
))
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||
result := &BenchmarkPowerAutotuneValidation{}
|
||||
if len(samples) == 0 {
|
||||
result.Reason = "no idle telemetry samples collected"
|
||||
return result
|
||||
}
|
||||
var gpuUsage []float64
|
||||
var cpuUsage []float64
|
||||
for _, sample := range samples {
|
||||
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||
if sample.CPUUsagePct > 0 {
|
||||
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||
}
|
||||
}
|
||||
result.GPUSamples = len(gpuUsage)
|
||||
result.CPUSamples = len(cpuUsage)
|
||||
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||
switch {
|
||||
case result.GPUAvgUsagePct > 5:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||
case result.GPUP95UsagePct > 10:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||
case result.CPUAvgUsagePct > 20:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||
case result.CPUP95UsagePct > 35:
|
||||
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||
default:
|
||||
result.Valid = true
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||
idleBySource := map[string][]float64{}
|
||||
loadBySource := map[string][]float64{}
|
||||
var idleGPU []float64
|
||||
var loadGPU []float64
|
||||
for _, sample := range idle {
|
||||
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
idleBySource[source] = append(idleBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, sample := range load {
|
||||
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||
for source, value := range sample.Sources {
|
||||
if value > 0 {
|
||||
loadBySource[source] = append(loadBySource[source], value)
|
||||
}
|
||||
}
|
||||
}
|
||||
idleGPUAvg := benchmarkMean(idleGPU)
|
||||
loadGPUAvg := benchmarkMean(loadGPU)
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
|
||||
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||
}
|
||||
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Available && candidate.DeltaW > 0 {
|
||||
available = append(available, candidate)
|
||||
}
|
||||
}
|
||||
if len(available) == 0 {
|
||||
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||
}
|
||||
sort.Slice(available, func(i, j int) bool {
|
||||
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||
if available[i].Source != available[j].Source {
|
||||
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||
}
|
||||
}
|
||||
if available[i].RelativeError != available[j].RelativeError {
|
||||
return available[i].RelativeError < available[j].RelativeError
|
||||
}
|
||||
return available[i].Samples > available[j].Samples
|
||||
})
|
||||
selected := available[0]
|
||||
for idx := range candidates {
|
||||
if candidates[idx].Source == selected.Source {
|
||||
candidates[idx].Selected = true
|
||||
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||
}
|
||||
}
|
||||
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||
}
|
||||
|
||||
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||
candidate := BenchmarkPowerAutotuneCandidate{
|
||||
Source: source,
|
||||
Available: len(idle) > 0 && len(load) > 0,
|
||||
Samples: minInt(len(idle), len(load)),
|
||||
}
|
||||
if !candidate.Available {
|
||||
return candidate
|
||||
}
|
||||
candidate.IdleAvgW = benchmarkMean(idle)
|
||||
candidate.LoadAvgW = benchmarkMean(load)
|
||||
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||
if gpuDelta > 0 {
|
||||
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||
}
|
||||
return candidate
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
||||
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
||||
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
||||
}
|
||||
if result.IdleValidation != nil {
|
||||
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
}
|
||||
for _, candidate := range result.Candidates {
|
||||
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
||||
if candidate.Available {
|
||||
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
||||
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
||||
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
||||
var b strings.Builder
|
||||
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
||||
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
||||
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
||||
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
||||
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
||||
if result.SelectedSource != "" {
|
||||
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if result.IdleValidation != nil {
|
||||
b.WriteString("## Idle Validation\n\n")
|
||||
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
||||
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
||||
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
||||
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
||||
if result.IdleValidation.Reason != "" {
|
||||
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.Candidates) > 0 {
|
||||
b.WriteString("## Candidates\n\n")
|
||||
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
||||
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
||||
for _, candidate := range result.Candidates {
|
||||
if !candidate.Available {
|
||||
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
||||
continue
|
||||
}
|
||||
selected := "no"
|
||||
if candidate.Selected {
|
||||
selected = "yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
||||
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||
allDevices := joinIndexList(gpuIndices)
|
||||
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||
case "power-fit", "power", "nvidia-bench-power":
|
||||
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||
if err == nil {
|
||||
return cmd, "power-fit"
|
||||
}
|
||||
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||
default:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||
"--devices", allDevices,
|
||||
}
|
||||
if sizeMB > 0 {
|
||||
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||
}
|
||||
return cmd, "performance"
|
||||
}
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = "/var/log/bee-bench/autotune"
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
hostname, _ := os.Hostname()
|
||||
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
||||
result := BenchmarkPowerAutotuneResult{
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
Hostname: hostname,
|
||||
ServerModel: readServerModel(),
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
Status: "FAILED",
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
||||
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
||||
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
||||
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
||||
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
||||
if result.IdleValidation != nil {
|
||||
result.IdleValidationError = result.IdleValidation.Reason
|
||||
logFunc(result.IdleValidation.Reason)
|
||||
}
|
||||
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
||||
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
||||
go func() {
|
||||
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
||||
}()
|
||||
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
||||
loadSamples := <-loadSamplesCh
|
||||
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
||||
if runErr != nil {
|
||||
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
||||
}
|
||||
|
||||
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
||||
result.Candidates = candidates
|
||||
result.GPUPowerIdleW = idleGPUAvg
|
||||
result.GPUPowerLoadW = loadGPUAvg
|
||||
if chooseErr != nil {
|
||||
result.Notes = append(result.Notes, chooseErr.Error())
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, chooseErr
|
||||
}
|
||||
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||
if gpuDelta <= 0 {
|
||||
gpuDelta = loadGPUAvg
|
||||
}
|
||||
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
||||
result.SelectedSource = selectedSource
|
||||
result.Status = "OK"
|
||||
var confidence float64
|
||||
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
||||
for _, candidate := range candidates {
|
||||
if candidate.Selected {
|
||||
confidence = candidate.Confidence
|
||||
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||
selectionReason = candidate.SelectionNotes
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
cfg := BenchmarkPowerAutotuneConfig{
|
||||
Version: benchmarkPowerAutotuneVersion,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
SelectedSource: selectedSource,
|
||||
BenchmarkKind: normalizedKind,
|
||||
Profile: opts.Profile,
|
||||
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
||||
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
Confidence: confidence,
|
||||
Reason: selectionReason,
|
||||
}
|
||||
result.Config = &cfg
|
||||
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
||||
result.Status = "FAILED"
|
||||
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
||||
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
return runDir, err
|
||||
}
|
||||
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
||||
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
||||
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal autotune result: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||
return fmt.Errorf("write autotune result.json: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||
return fmt.Errorf("write autotune report.md: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// Blank-identifier reference to the exec package (presumably os/exec): it
// keeps that import in use even if nothing else in this file refers to it.
var _ = exec.ErrNotFound
|
||||
558
audit/internal/platform/benchmark_report.go
Normal file
558
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,558 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return renderBenchmarkReportWithCharts(result)
|
||||
}
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||
|
||||
// System identity block
|
||||
if result.ServerModel != "" {
|
||||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||
}
|
||||
if result.Hostname != "" {
|
||||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||
}
|
||||
// GPU models summary
|
||||
if len(result.GPUs) > 0 {
|
||||
modelCount := make(map[string]int)
|
||||
var modelOrder []string
|
||||
for _, g := range result.GPUs {
|
||||
m := strings.TrimSpace(g.Name)
|
||||
if m == "" {
|
||||
m = "Unknown GPU"
|
||||
}
|
||||
if modelCount[m] == 0 {
|
||||
modelOrder = append(modelOrder, m)
|
||||
}
|
||||
modelCount[m]++
|
||||
}
|
||||
var parts []string
|
||||
for _, m := range modelOrder {
|
||||
if modelCount[m] == 1 {
|
||||
parts = append(parts, m)
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
if result.RampRunID != "" {
|
||||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||
}
|
||||
} else if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
if result.PlatformPowerScore > 0 {
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||
if len(result.Findings) > 0 {
|
||||
b.WriteString("## Executive Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
fmt.Fprintf(&b, "- %s\n", finding)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
b.WriteString("## Warnings\n\n")
|
||||
for _, warning := range result.Warnings {
|
||||
fmt.Fprintf(&b, "- %s\n", warning)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||
fmt.Sprintf("%.1f°C", headroom),
|
||||
throttlePct,
|
||||
thermalStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
powerCap,
|
||||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||
fanDuty,
|
||||
powerStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||
synthetic, mixed, mixedEff, topsPerSM,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||
|
||||
// Identity
|
||||
if gpu.BusID != "" {
|
||||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||
}
|
||||
if gpu.VBIOS != "" {
|
||||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||
}
|
||||
if gpu.ComputeCapability != "" {
|
||||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||
}
|
||||
if gpu.MultiprocessorCount > 0 {
|
||||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||
}
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.PowerLimitDerated {
|
||||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||
}
|
||||
if gpu.CalibratedPeakPowerW > 0 {
|
||||
if gpu.CalibratedPeakTempC > 0 {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||
}
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"", "Avg", "P95"},
|
||||
[][]string{
|
||||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
}
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
var precRows [][]string
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
if !p.ECC.IsZero() {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
status := p.Status
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
precRows = append(precRows, []string{
|
||||
p.Precision, status,
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||
eccCorr, eccUncorr,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||||
}
|
||||
|
||||
// ECC summary
|
||||
if !gpu.ECC.IsZero() {
|
||||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||||
}
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
if throttle != "none" {
|
||||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||
}
|
||||
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
var presRows [][]string
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
presRows = append(presRows, []string{
|
||||
p.Name,
|
||||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||
fmt.Sprintf("×%.3g", p.Weight),
|
||||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||
fmt.Sprintf("%d", p.Lanes),
|
||||
fmt.Sprintf("%d", p.Iterations),
|
||||
})
|
||||
} else {
|
||||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||
}
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Degradation / Notes
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
}
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("**Notes:**\n\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Avg", "Max"},
|
||||
[][]string{
|
||||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(result.Interconnect.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power ───────────────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
title := "## Server Power\n\n"
|
||||
if sp.Source != "" {
|
||||
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
|
||||
}
|
||||
b.WriteString(title)
|
||||
if !sp.Available {
|
||||
b.WriteString("Server power measurement unavailable.\n\n")
|
||||
} else {
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||
}
|
||||
if sp.ReportingRatio > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(sp.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Value"},
|
||||
[][]string{
|
||||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||
{"Average fan duty cycle", dutyAvg},
|
||||
{"P95 fan duty cycle", dutyP95},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
}
|
||||
for _, note := range cooling.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(cooling.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
var scalRows [][]string
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||
type counter struct {
|
||||
label string
|
||||
us uint64
|
||||
}
|
||||
counters := []counter{
|
||||
{"sw_power", t.SWPowerCapUS},
|
||||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||
{"sync_boost", t.SyncBoostUS},
|
||||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||
}
|
||||
var parts []string
|
||||
for _, c := range counters {
|
||||
if c.us == 0 {
|
||||
continue
|
||||
}
|
||||
sec := float64(c.us) / 1e6
|
||||
if steadyDurationSec > 0 {
|
||||
pct := sec / steadyDurationSec * 100
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||
} else if sec < 1 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||
}
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return "none"
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||
var best float64
|
||||
for i, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||
best = gpu.Scores.CompositeScore
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// fmtMDTable renders a markdown table with column widths padded so the table
// is readable as plain text without a markdown renderer.
//
// headers contains the column header strings; an empty headers slice yields "".
// rows contains the data rows. A row with fewer cells than headers is padded
// with empty cells; cells beyond len(headers) are ignored.
//
// Column widths are measured in runes, not bytes, so cells containing
// multi-byte UTF-8 characters (for example "°C" or "×") stay aligned.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellAt returns the i-th cell of cells, or "" when the row is short.
	cellAt := func(cells []string, i int) string {
		if i < len(cells) {
			return cells[i]
		}
		return ""
	}
	// displayWidth approximates the printed width of s as its rune count.
	displayWidth := func(s string) int { return len([]rune(s)) }

	// Compute max width per column across the header and every data row.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = displayWidth(h)
	}
	for _, row := range rows {
		for i := 0; i < ncols; i++ {
			if w := displayWidth(cellAt(row, i)); w > widths[i] {
				widths[i] = w
			}
		}
	}

	var b strings.Builder

	// writeRow emits one "| a | b |" line, padding every cell to its column width.
	writeRow := func(cells []string) {
		b.WriteByte('|')
		for i := 0; i < ncols; i++ {
			cell := cellAt(cells, i)
			b.WriteByte(' ')
			b.WriteString(cell)
			b.WriteString(strings.Repeat(" ", widths[i]-displayWidth(cell)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}

	// Header row.
	writeRow(headers)

	// Separator row: dashes cover the cell plus its two padding spaces.
	b.WriteByte('|')
	for i := range headers {
		b.WriteString(strings.Repeat("-", widths[i]+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		writeRow(row)
	}

	return b.String()
}
|
||||
576
audit/internal/platform/benchmark_test.go
Normal file
576
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,576 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
profile string
|
||||
want benchmarkProfileSpec
|
||||
}{
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := resolveBenchmarkProfile(tc.profile)
|
||||
if got != tc.want {
|
||||
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if len(labels) != 5 || len(phases) != 5 {
|
||||
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||
}
|
||||
if basePhaseSec != 60 {
|
||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 300 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||
}
|
||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 300 {
|
||||
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 3600 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 3600 {
|
||||
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 14400 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
phases := []benchmarkPlannedPhase{
|
||||
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||
}
|
||||
rows := []GPUMetricRow{
|
||||
{ElapsedSec: 5},
|
||||
{ElapsedSec: 15},
|
||||
{ElapsedSec: 25},
|
||||
{ElapsedSec: 65},
|
||||
}
|
||||
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||
if len(got["fp8"]) != 1 {
|
||||
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||
}
|
||||
if len(got["fp16"]) != 1 {
|
||||
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||
}
|
||||
if len(got["mixed"]) != 2 {
|
||||
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
wantStatus string
|
||||
}{
|
||||
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||
if got != tc.wantStatus {
|
||||
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
before := BenchmarkThrottleCounters{}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||
}
|
||||
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldExec := satExecCommand
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
t.Fatalf("unexpected command: %s %v", name, args)
|
||||
return nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
satExecCommand = oldExec
|
||||
})
|
||||
|
||||
var logs []string
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||
logs = append(logs, line)
|
||||
})
|
||||
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||
t.Fatalf("logs=%q want substring %q", got, want)
|
||||
}
|
||||
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||
t.Fatalf("failed=%v want [0 2]", failed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "nvidia-smi")
|
||||
argsLog := filepath.Join(dir, "args.log")
|
||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldLookPath := satLookPath
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "nvidia-smi" {
|
||||
return script, nil
|
||||
}
|
||||
return exec.LookPath(file)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
satLookPath = oldLookPath
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
raw, err := os.ReadFile(argsLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read args log: %v", err)
|
||||
}
|
||||
got := strings.Fields(string(raw))
|
||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||
t.Fatalf("args=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||
Profile: "stability",
|
||||
RunNCCL: false,
|
||||
})
|
||||
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||
}
|
||||
if opts.RunNCCL {
|
||||
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||
}
|
||||
}
|
||||
|
||||
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
info benchmarkGPUInfo
|
||||
want int
|
||||
}{
|
||||
{
|
||||
name: "prefers default tdp over current derated limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 600,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 600,
|
||||
},
|
||||
{
|
||||
name: "caps default tdp to reported max limit",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 500,
|
||||
DefaultPowerLimitW: 700,
|
||||
MaxPowerLimitW: 650,
|
||||
},
|
||||
want: 650,
|
||||
},
|
||||
{
|
||||
name: "falls back to current limit when default missing",
|
||||
info: benchmarkGPUInfo{
|
||||
PowerLimitW: 525,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
want: 525,
|
||||
},
|
||||
{
|
||||
name: "falls back to max limit when only that is known",
|
||||
info: benchmarkGPUInfo{
|
||||
MaxPowerLimitW: 575,
|
||||
},
|
||||
want: 575,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := strings.Join([]string{
|
||||
"loader=bee-gpu-burn",
|
||||
"[gpu 0] device=NVIDIA H100",
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] int8_tensor_iterations=80",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
}, "\n")
|
||||
|
||||
got := parseBenchmarkBurnLog(raw)
|
||||
if got.Backend != "cublasLt" {
|
||||
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||
}
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 3 {
|
||||
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[0].Category != "fp16_bf16" {
|
||||
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
if got.Profiles[2].Category != "int8" {
|
||||
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||
}
|
||||
if got.Profiles[2].Weight != 0.25 {
|
||||
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
result := NvidiaBenchmarkResult{
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "PARTIAL",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "partial",
|
||||
},
|
||||
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||
GPUs: []BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100",
|
||||
Status: "OK",
|
||||
Steady: BenchmarkTelemetrySummary{
|
||||
AvgPowerW: 680,
|
||||
AvgTempC: 79,
|
||||
AvgGraphicsClockMHz: 1725,
|
||||
P95PowerW: 700,
|
||||
P95TempC: 82,
|
||||
P95GraphicsClockMHz: 1800,
|
||||
},
|
||||
Scores: BenchmarkScorecard{
|
||||
ComputeScore: 1200,
|
||||
PowerSustainScore: 96,
|
||||
ThermalSustainScore: 88,
|
||||
StabilityScore: 92,
|
||||
CompositeScore: 1176,
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||
},
|
||||
Throttle: BenchmarkThrottleCounters{
|
||||
SWPowerCapUS: 1000000,
|
||||
},
|
||||
DegradationReasons: []string{"power_capped"},
|
||||
},
|
||||
},
|
||||
Cooling: &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: 9200,
|
||||
FanDutyCycleAvailable: true,
|
||||
AvgFanDutyCyclePct: 47.5,
|
||||
P95FanDutyCyclePct: 62.0,
|
||||
},
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
for _, needle := range []string{
|
||||
"Executive Summary",
|
||||
"GPU 0 spent measurable time under SW power cap.",
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
"Cooling",
|
||||
"Average fan duty cycle",
|
||||
"47.5%",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "OK",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"gpu-metrics.csv",
|
||||
"gpu-metrics.html",
|
||||
"gpu-burn.log",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
})
|
||||
|
||||
if score.SyntheticScore != 100 {
|
||||
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||
}
|
||||
if score.MixedScore != 50 {
|
||||
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Min Power Limit : 200.00 W
|
||||
Max Power Limit : 600.00 W
|
||||
Default Power Limit : 575.00 W
|
||||
Current Power Limit : 560.00 W
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
SM : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
Video : 2107 MHz
|
||||
|
||||
GPU 00000000:4F:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
`)
|
||||
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||
}
|
||||
if infoByIndex[0].PowerLimitW != 560 {
|
||||
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Min Power Limit : 100.00 W
|
||||
Max Power Limit : 900.00 W
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {
|
||||
Index: 0,
|
||||
BusID: "00000000:4E:00.0",
|
||||
MaxGraphicsClockMHz: 2430,
|
||||
MaxMemoryClockMHz: 12481,
|
||||
MinPowerLimitW: 200,
|
||||
MaxPowerLimitW: 600,
|
||||
},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||
}
|
||||
}
|
||||
536
audit/internal/platform/benchmark_types.go
Normal file
536
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,536 @@
|
||||
package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig describes the host's static CPU and memory configuration
// as captured when a benchmark starts, so that results from different machines
// can be correlated. Every field is omitted from JSON when it holds its zero
// value.
type BenchmarkHostConfig struct {
	CPUModel    string  `json:"cpu_model,omitempty"`
	CPUSockets  int     `json:"cpu_sockets,omitempty"`
	CPUCores    int     `json:"cpu_cores,omitempty"`
	CPUThreads  int     `json:"cpu_threads,omitempty"`
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||
|
||||
// BenchmarkCPULoad is a summary of host CPU utilisation sampled while the GPUs
// were in their steady-state phase. A high or unstable CPU load during a GPU
// benchmark may point to a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	AvgPct  float64 `json:"avg_pct"`
	MaxPct  float64 `json:"max_pct"`
	P95Pct  float64 `json:"p95_pct"`
	Samples int     `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	Note   string `json:"note,omitempty"`
}
|
||||
|
||||
// BenchmarkCoolingSummary holds fan telemetry averaged over the whole benchmark
// run. All fields except Available are omitted from JSON when zero.
type BenchmarkCoolingSummary struct {
	Available             bool     `json:"available"`
	AvgFanRPM             float64  `json:"avg_fan_rpm,omitempty"`
	FanDutyCycleAvailable bool     `json:"fan_duty_cycle_available,omitempty"`
	FanDutyCycleEstimated bool     `json:"fan_duty_cycle_estimated,omitempty"`
	AvgFanDutyCyclePct    float64  `json:"avg_fan_duty_cycle_pct,omitempty"`
	P95FanDutyCyclePct    float64  `json:"p95_fan_duty_cycle_pct,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}
|
||||
|
||||
// Benchmark profile names.
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)
|
||||
|
||||
// Identifiers for the load engines used by the power benchmark.
const (
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	BenchmarkPowerEngineTargetedPower  = "targeted_power"
)
|
||||
|
||||
// Estimated wall-clock durations (in seconds) for benchmark runs, derived from
// real _v8 logs.
//
// Rule: whenever the profile phase durations in resolveBenchmarkProfile() are
// changed, re-measure from actual task logs and update these constants.
//
// Sources:
//   - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
//   - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
//   - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
//   - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
//   - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
const (
	// Performance Benchmark (bee-gpu-burn).
	// The duration covers one full ramp-up run (ramp 1→N) or one single
	// parallel run. Sequential per-GPU mode scales approximately linearly.
	BenchmarkEstimatedPerfStandardSec  = 960  // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
	BenchmarkEstimatedPerfOvernightSec = 8 * 3600

	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
	// The duration covers the full ramp-up run; individual steps vary with
	// convergence speed.
	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
	BenchmarkEstimatedPowerOvernightSec = 3 * 3600
)
|
||||
|
||||
// NvidiaBenchmarkOptions carries the caller-supplied settings for one NVIDIA
// benchmark run: profile name, GPU selection, and ramp-up bookkeeping.
type NvidiaBenchmarkOptions struct {
	Profile           string
	SizeMB            int
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
	ServerPowerSource string

	// ParallelGPUs runs all selected GPUs simultaneously instead of sequentially.
	ParallelGPUs bool
	// RampStep is the 1-based step index within a ramp-up run (0 = not a ramp-up).
	RampStep int
	// RampTotal is the total number of ramp-up steps in this run.
	RampTotal int
	// RampRunID is the identifier shared by all steps of the same ramp-up run.
	RampRunID string
}
|
||||
|
||||
// Identifiers for the server power measurement sources.
const (
	BenchmarkPowerSourceDCMI        = "dcmi"
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)
|
||||
|
||||
type BenchmarkPowerAutotuneConfig struct {
|
||||
Version int `json:"version"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
SelectedSource string `json:"selected_source"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
Profile string `json:"profile,omitempty"`
|
||||
IdleDurationSec int `json:"idle_duration_sec,omitempty"`
|
||||
LoadDurationSec int `json:"load_duration_sec,omitempty"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||
Confidence float64 `json:"confidence,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
}
|
||||
|
||||
type SystemPowerSourceDecision struct {
|
||||
Configured bool `json:"configured"`
|
||||
SelectedSource string `json:"selected_source,omitempty"`
|
||||
EffectiveSource string `json:"effective_source,omitempty"`
|
||||
Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded
|
||||
Reason string `json:"reason,omitempty"`
|
||||
ConfiguredAt time.Time `json:"configured_at,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkPowerAutotuneResult struct {
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||
Profile string `json:"profile,omitempty"`
|
||||
Status string `json:"status"`
|
||||
IdleDurationSec int `json:"idle_duration_sec"`
|
||||
LoadDurationSec int `json:"load_duration_sec"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||
SelectedSource string `json:"selected_source,omitempty"`
|
||||
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPowerAutotuneValidation records whether a validation window was
// accepted, together with the average and p95 GPU/CPU usage and the sample
// counts it was based on.
type BenchmarkPowerAutotuneValidation struct {
	Valid          bool    `json:"valid"`
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
	GPUSamples     int     `json:"gpu_samples,omitempty"`
	CPUSamples     int     `json:"cpu_samples,omitempty"`
	Reason         string  `json:"reason,omitempty"`
}
|
||||
|
||||
// BenchmarkPowerAutotuneCandidate describes one power source evaluated during
// autotune: its idle and load averages, the delta between them, and the
// error/confidence figures and selection outcome recorded for it.
type BenchmarkPowerAutotuneCandidate struct {
	Source         string  `json:"source"`
	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
	DeltaW         float64 `json:"delta_w,omitempty"`
	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkNormalizationGPU is the per-GPU entry of BenchmarkNormalization.
// Every field except Index is omitted from JSON when it holds its zero value.
type BenchmarkNormalizationGPU struct {
|
||||
// Index identifies the GPU; always serialized, so GPU 0 is still emitted.
Index int `json:"index"`
|
||||
PersistenceMode string `json:"persistence_mode,omitempty"`
|
||||
// GPUClockLockMHz / GPUClockLockStatus: presumably the requested graphics
// clock lock and the outcome of applying it — TODO confirm with the producer.
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
|
||||
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
|
||||
// MemoryClockLockMHz / MemoryClockLockStatus: memory-clock counterpart of the
// two fields above (same caveat).
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
|
||||
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkGPUResult is the per-GPU section of a benchmark result: identity,
// power/thermal limits, calibration values, per-phase telemetry summaries
// (baseline, steady, cooldown), throttle and ECC counters, per-precision
// results, and the derived scorecard.
type BenchmarkGPUResult struct {
|
||||
// Index is always serialized (no omitempty), so GPU 0 is still emitted.
Index int `json:"index"`
|
||||
UUID string `json:"uuid,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
VBIOS string `json:"vbios,omitempty"`
|
||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
// Baseline, Steady and Cooldown are telemetry summaries for the three
// benchmark phases; all three are always serialized.
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
// NOTE(review): encoding/json's omitempty never omits a struct value, so "ecc"
// is emitted even when both counters are zero. BenchmarkECCCounters has an
// IsZero method, which the `omitzero` option (Go 1.24+) would honour — confirm
// the intended behaviour and toolchain version before changing the tag.
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkTelemetrySummary aggregates GPU telemetry over one measurement
// window: duration, sample count, and average / p95 values for temperature,
// power and clocks, plus utilisation averages and variability figures.
// It is reused for the baseline, steady and cooldown phases of
// BenchmarkGPUResult. No field uses omitempty, so zeros are serialized.
type BenchmarkTelemetrySummary struct {
|
||||
DurationSec float64 `json:"duration_sec"`
|
||||
// Samples is the number of telemetry samples the statistics were computed from.
Samples int `json:"samples"`
|
||||
AvgTempC float64 `json:"avg_temp_c"`
|
||||
P95TempC float64 `json:"p95_temp_c"`
|
||||
AvgPowerW float64 `json:"avg_power_w"`
|
||||
P95PowerW float64 `json:"p95_power_w"`
|
||||
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
|
||||
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
|
||||
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
|
||||
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
|
||||
AvgUsagePct float64 `json:"avg_usage_pct"`
|
||||
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
|
||||
// ClockCVPct / PowerCVPct / TempCVPct: presumably the coefficient of variation
// (stddev / mean) in percent for each signal — TODO confirm with the producer.
ClockCVPct float64 `json:"clock_cv_pct"`
|
||||
PowerCVPct float64 `json:"power_cv_pct"`
|
||||
TempCVPct float64 `json:"temp_cv_pct"`
|
||||
// ClockDriftPct: how the drift is measured is not visible here
// (NOTE(review): likely start-vs-end clock change over the window — verify).
ClockDriftPct float64 `json:"clock_drift_pct"`
|
||||
}
|
||||
|
||||
// BenchmarkThrottleCounters holds one counter per GPU throttle reason.
// All counters are always serialized, including zeros.
// NOTE(review): the US / _us suffix suggests accumulated time in microseconds —
// confirm the unit against the code that fills these in.
type BenchmarkThrottleCounters struct {
|
||||
// Software power cap.
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
|
||||
// Software thermal slowdown.
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
|
||||
// Sync boost.
SyncBoostUS uint64 `json:"sync_boost_us"`
|
||||
// Hardware thermal slowdown.
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
|
||||
// Hardware power-brake slowdown.
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||
}
|
||||
|
||||
// BenchmarkECCCounters is a point-in-time sample of ECC error counts.
//
//   - Corrected: single-bit errors that ECC repaired (DRAM degradation).
//   - Uncorrected: double-bit errors ECC could not repair (serious fault).
//
// Both counters are volatile (since the last driver reset), not persistent.
type BenchmarkECCCounters struct {
	Corrected   uint64 `json:"corrected"`
	Uncorrected uint64 `json:"uncorrected"`
}

// Total returns the combined number of corrected and uncorrected errors.
func (e BenchmarkECCCounters) Total() uint64 {
	sum := e.Corrected
	sum += e.Uncorrected
	return sum
}

// IsZero reports whether no ECC error of either kind has been counted.
func (e BenchmarkECCCounters) IsZero() bool {
	anyErrors := e.Corrected != 0 || e.Uncorrected != 0
	return !anyErrors
}
|
||||
|
||||
// BenchmarkPrecisionResult is the throughput result for one precision kernel.
// Name, Category and Supported are always serialized; the remaining fields
// are omitted from JSON when zero.
type BenchmarkPrecisionResult struct {
|
||||
Name string `json:"name"`
|
||||
// Category is the precision category that Weight (below) is chosen by.
Category string `json:"category"`
|
||||
Supported bool `json:"supported"`
|
||||
Lanes int `json:"lanes,omitempty"`
|
||||
// M, N, K: presumably the matrix-multiply problem dimensions used by the
// kernel — TODO confirm with the benchmark backend.
M uint64 `json:"m,omitempty"`
|
||||
N uint64 `json:"n,omitempty"`
|
||||
K uint64 `json:"k,omitempty"`
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkScorecard collects the derived per-GPU scores: compute throughput
// (synthetic, mixed, composite), sustain and stability scores, the throttle
// breakdown, temperature headroom, and interconnect / server-quality scores.
type BenchmarkScorecard struct {
|
||||
ComputeScore float64 `json:"compute_score"`
|
||||
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||
// steady phases (each precision ran alone, full GPU dedicated).
|
||||
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||
// (all precisions competing simultaneously — closer to real workloads).
|
||||
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||
// sustains throughput under concurrent mixed-precision load.
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
// StabilityScore is derived from the fraction of steady-state time the GPU
|
||||
// spent throttling (thermal + power cap combined), inverted so that higher is
// better: 0% throttle = 100; 100% throttle = 0.
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||
// that throttles an otherwise fast GPU.
|
||||
ServerQualityScore float64 `json:"server_quality_score"`
|
||||
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||
// so benchmark and audit data can be correlated by slot.
|
||||
// InputW and OutputW are pointers so that a missing reading (nil, omitted from
// JSON) is distinguishable from a measured 0 W.
type BenchmarkPSUSlotPower struct {
|
||||
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||
// Status: value set is not defined here (NOTE(review): presumably the PSU
// sensor state reported by IPMI — verify against the collector).
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power from multiple independent
|
||||
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||
// covers only a subset of installed PSUs (partial coverage).
|
||||
//
|
||||
// Source legend:
|
||||
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
// Available is always serialized, so consumers can tell "no server power
// data" apart from a missing section.
Available bool `json:"available"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Mode string `json:"mode,omitempty"`
|
||||
// Reason: NOTE(review): presumably explains why data is unavailable — confirm.
Reason string `json:"reason,omitempty"`
|
||||
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
// ReportingRatio: the exact formula is not visible here (NOTE(review):
// presumably relates DeltaW to GPUReportedSumW — verify against the producer).
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||
|
||||
// PSU DC output sum — power delivered to server internals after conversion.
|
||||
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||
|
||||
// Per-slot PSU readings at idle and at peak load.
|
||||
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||
|
||||
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||
|
||||
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
// during a dedicated single-precision steady window. Because only one kernel
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Status string `json:"status,omitempty"`
|
||||
// Steady is the telemetry summary for this precision's window; always serialized.
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
// NOTE(review): omitempty has no effect on a struct value in encoding/json, so
// "ecc" is emitted even when both counters are zero — confirm that is intended.
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkInterconnectResult is the GPU-to-GPU interconnect section of a
// benchmark result. Status, Attempted and Supported are always serialized so a
// skipped or unsupported test is still visible in the JSON.
type BenchmarkInterconnectResult struct {
|
||||
Status string `json:"status"`
|
||||
Attempted bool `json:"attempted"`
|
||||
Supported bool `json:"supported"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
|
||||
// AlgBW / BusBW figures in GB/s. NOTE(review): the names match the
// "algorithm bandwidth" / "bus bandwidth" terms used by collective-communication
// benchmarks — confirm which tool produces them.
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
|
||||
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
|
||||
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPowerBenchResult is the top-level JSON document of the NVIDIA power
// benchmark: run metadata, the per-step cumulative ramp (RampSteps), per-GPU
// results (GPUs), the platform power budget and optional server-side power data.
type NvidiaPowerBenchResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
// GPUs has no omitempty: a nil slice serializes as JSON null, an empty one as [].
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPowerBenchGPU is the per-GPU result of the power benchmark: identity,
// default / applied / stable power limits, observed peaks, calibration
// metadata, server power seen during this GPU's calibration, and fan state.
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||
// additional derating.
|
||||
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, clock and utilisation statistics).
// BenchmarkTelemetrySummary carries no fan fields; fan state is reported
// separately in AvgFanRPM / AvgFanDutyCyclePct.
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
// Fan state sampled at the end of single-card calibration.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPowerBenchStep describes one step of the power benchmark's ramp: the
// GPUs that were active, the GPU whose stable limit was searched in this step,
// observed power, and the server / PSU / fan state captured for the step.
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
// GPUIndices lists the GPUs in this step (NOTE(review): presumably all GPUs
// running simultaneously during the step, including NewGPUIndex — confirm).
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// PSU slot readings sampled at end of this ramp step.
|
||||
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||
// Fan state at end of this ramp step.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||
// encoding/json writes the integer keys as decimal strings ("0", "1", …).
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
// scalability ramp-up phase of the performance benchmark.
|
||||
type NvidiaPerformanceRampStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
// GPUIndices lists the GPUs that took part in this step.
GPUIndices []int `json:"gpu_indices"`
|
||||
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||
// TotalMixedTOPS: NOTE(review): presumably the matching sum of per-GPU
// MixedScore — confirm against the code that fills it in.
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||
// NOTE(review): k is presumably the number of GPUs in this step — confirm.
ScalabilityPct float64 `json:"scalability_pct"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
	// Name is a short machine-readable label for logging and deduplication.
	Name string
	// Re is the compiled regular expression matched against a single kmsg line.
	Re *regexp.Regexp
	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
	Category string
	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
	Severity string
	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
	BDFGroup int
	// DevGroup is the capture group index (1-based) that contains a device name
	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
	DevGroup int
}

// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
//
// Every expression starts with (?i), so matching is case-insensitive and
// explicit classes such as [Uu] are unnecessary. Because of that, a word that
// is a suffix of its own negation ("correctable" / "Uncorrectable",
// "corrected" / "Uncorrected") must be anchored with \b, otherwise the
// "corrected" pattern also fires on every uncorrected-error line.
var HardwareErrorPatterns = []ErrorPattern{
	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
	{
		Name:     "nvidia-rminitadapter",
		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "nvidia-msi-fail",
		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
		Category: "gpu",
		Severity: "warning",
	},
	{
		Name:     "nvidia-aer",
		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		// NOTE(review): this requires a full BDF including the ".N" function
		// suffix. Driver Xid lines commonly print only domain:bus:device
		// (e.g. "NVRM: Xid (PCI:0000:c8:00): 79, ..."), which this expression
		// would not match — verify against real logs before relying on it.
		Name:     "nvidia-xid",
		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
	{
		Name:     "pcie-aer",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-uncorrectable",
		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*uncorrectable`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-link-down",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*link.*down`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── Storage ─────────────────────────────────────────────────────────────────
	{
		Name:     "blk-io-error",
		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "nvme-timeout",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "scsi-failed",
		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
		Category: "storage",
		Severity: "warning",
	},
	{
		Name:     "nvme-reset",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},

	// ── Machine Check Exceptions ────────────────────────────────────────────────
	{
		Name:     "mce-hardware-error",
		Re:       mustPat(`(?i)mce:.*hardware error`),
		Category: "mce",
		Severity: "warning",
	},
	{
		// \b keeps "Uncorrected ..." lines from being classified as corrected.
		Name:     "mce-corrected",
		Re:       mustPat(`(?i)mce:.*\bcorrected`),
		Category: "mce",
		Severity: "warning",
	},

	// ── Memory ─────────────────────────────────────────────────────────────────
	{
		Name:     "edac-ue",
		Re:       mustPat(`(?i)EDAC.*uncorrectable`),
		Category: "memory",
		Severity: "warning",
	},
	{
		// \b keeps "Uncorrectable" lines (already covered by edac-ue) from also
		// being reported as correctable errors.
		Name:     "edac-ce",
		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
		Category: "memory",
		Severity: "warning",
	},
}

// mustPat compiles s and panics if it is not a valid regular expression, so a
// malformed pattern in HardwareErrorPatterns fails at package initialization.
func mustPat(s string) *regexp.Regexp {
	return regexp.MustCompile(s)
}
|
||||
@@ -13,19 +13,27 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
Stage string `json:"stage,omitempty"`
|
||||
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
@@ -46,7 +54,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
if len(parts) < 7 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
@@ -57,6 +65,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
MemClockMHz: parseGPUFloat(parts[6]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
@@ -139,14 +148,28 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
// gpuMetricStageSpan is one named stage interval on a GPU metrics chart.
// WriteGPUMetricsHTML assigns each span a colour by Name, and drawGPUChartSVG
// maps Start and End to x coordinates to shade and label the stage background.
// NOTE(review): Start/End are presumably elapsed seconds on the same axis as
// GPUMetricRow.ElapsedSec — confirm in buildGPUMetricStageSpans.
type gpuMetricStageSpan struct {
|
||||
Name string
|
||||
Start float64
|
||||
End float64
|
||||
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
@@ -161,9 +184,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
stageSpans := buildGPUMetricStageSpans(rows)
|
||||
stageColorByName := make(map[string]string, len(stageSpans))
|
||||
for i, span := range stageSpans {
|
||||
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||
}
|
||||
|
||||
var legend strings.Builder
|
||||
if len(stageSpans) > 0 {
|
||||
legend.WriteString(`<div class="stage-legend">`)
|
||||
for _, span := range stageSpans {
|
||||
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||
}
|
||||
legend.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -173,21 +212,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||
*{box-sizing:border-box}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||
.page{padding:24px}
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||
.card-body{padding:16px}
|
||||
h1{font-size:22px;margin:0 0 6px}
|
||||
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||
.chart-block{margin-top:16px}
|
||||
</style>
|
||||
</head><body>
|
||||
<div class="page">
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress Test Metrics</div>
|
||||
<div class="card-body">
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
</body></html>`, ts, svgs.String())
|
||||
<div class="chart-block">%s</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>`, ts, legend.String(), svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
@@ -197,7 +254,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
const PW = plotX2 - plotX1
|
||||
const PH = plotY2 - plotY1
|
||||
// Outer axes
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const clockAxisX = 900 // clock axis line
|
||||
|
||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||
@@ -282,6 +339,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Stage backgrounds
|
||||
for _, span := range stageSpans {
|
||||
x1 := xv(span.Start)
|
||||
x2 := xv(span.End)
|
||||
if x2 < x1 {
|
||||
x1, x2 = x2, x1
|
||||
}
|
||||
if x2-x1 < 1 {
|
||||
x2 = x1 + 1
|
||||
}
|
||||
color := stageColorByName[span.Name]
|
||||
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||
x1, plotY1, x2-x1, PH, color)
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||
}
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
@@ -380,224 +454,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// Terminal escape sequences used to colour chart output. renderLineChart
// writes the series colour before each grid row and ansiReset after it.
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
// Dimensions RenderGPUTerminalChart passes to renderLineChart as its width
// and height arguments. renderLineChart draws at most width columns and
// height+1 rows.
const (
|
||||
termChartWidth = 70
|
||||
termChartHeight = 12
|
||||
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Used in SAT stress-test logs.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
caption string
|
||||
color string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||
for _, d := range defs {
|
||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||
termChartHeight, termChartWidth))
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
if len(vals) == 0 {
|
||||
return caption + "\n"
|
||||
}
|
||||
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
|
||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
||||
w := width
|
||||
if len(vals) < w {
|
||||
w = len(vals)
|
||||
}
|
||||
data := gpuDownsample(vals, w)
|
||||
|
||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
||||
row := make([]int, w)
|
||||
for i, v := range data {
|
||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
||||
if r < 0 {
|
||||
r = 0
|
||||
}
|
||||
if r > height {
|
||||
r = height
|
||||
}
|
||||
row[i] = r
|
||||
}
|
||||
|
||||
// Fill the character grid.
|
||||
grid := make([][]rune, height+1)
|
||||
for i := range grid {
|
||||
grid[i] = make([]rune, w)
|
||||
for j := range grid[i] {
|
||||
grid[i][j] = ' '
|
||||
}
|
||||
}
|
||||
for x := 0; x < w; x++ {
|
||||
r := row[x]
|
||||
if x == 0 {
|
||||
grid[r][0] = '─'
|
||||
continue
|
||||
}
|
||||
p := row[x-1]
|
||||
switch {
|
||||
case r == p:
|
||||
grid[r][x] = '─'
|
||||
case r < p: // value went up (row index decreased toward top)
|
||||
grid[r][x] = '╭'
|
||||
grid[p][x] = '╯'
|
||||
for y := r + 1; y < p; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
default: // r > p, value went down
|
||||
grid[p][x] = '╮'
|
||||
grid[r][x] = '╰'
|
||||
for y := p + 1; y < r; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Y axis tick labels.
|
||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
||||
tickAtRow := make(map[int]string)
|
||||
labelWidth := 4
|
||||
for _, t := range ticks {
|
||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
||||
if r < 0 || r > height {
|
||||
continue
|
||||
}
|
||||
s := gpuFormatTick(t)
|
||||
tickAtRow[r] = s
|
||||
if len(s) > labelWidth {
|
||||
labelWidth = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for r := 0; r <= height; r++ {
|
||||
label := tickAtRow[r]
|
||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
||||
switch {
|
||||
case label != "":
|
||||
b.WriteRune('┤')
|
||||
case r == height:
|
||||
b.WriteRune('┼')
|
||||
default:
|
||||
b.WriteRune('│')
|
||||
}
|
||||
b.WriteString(color)
|
||||
b.WriteString(string(grid[r]))
|
||||
b.WriteString(ansiReset)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
// Bottom axis.
|
||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
||||
b.WriteRune('└')
|
||||
b.WriteString(strings.Repeat("─", w))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Caption centered under the chart.
|
||||
if caption != "" {
|
||||
total := labelWidth + 1 + w
|
||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
||||
b.WriteString(strings.Repeat(" ", pad))
|
||||
}
|
||||
b.WriteString(caption)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||
v := make([]float64, len(rows))
|
||||
for i, r := range rows {
|
||||
v[i] = fn(r)
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// gpuDownsample resamples vals to exactly w points.
//
// When len(vals) >= w the input is split into w contiguous buckets and each
// output point is the mean of its bucket. When len(vals) < w the output is a
// nearest-neighbour upsample that maps the first and last output points onto
// the first and last input values. An empty input yields w zeros, and a
// non-positive w yields an empty slice instead of panicking.
func gpuDownsample(vals []float64, w int) []float64 {
	if w <= 0 {
		return []float64{}
	}
	result := make([]float64, w)
	n := len(vals)
	if n == 0 {
		return result
	}

	if n < w {
		// Nearest-neighbour upsample. 1 <= n < w implies w >= 2, so w-1 is
		// never zero, and i <= w-1 keeps the source index within [0, n-1].
		for i := range result {
			result[i] = vals[i*(n-1)/(w-1)]
		}
		return result
	}

	// Bucket averaging. i < n guarantees i*w/n < w, so no clamping is needed.
	counts := make([]int, w)
	for i, v := range vals {
		bucket := i * w / n
		result[bucket] += v
		counts[bucket]++
	}
	for i, c := range counts {
		if c > 0 {
			result[i] /= float64(c)
		}
	}
	return result
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
@@ -642,3 +498,57 @@ func gpuFormatTick(v float64) string {
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
|
||||
// gpuMetricStagePalette lists seven hex colours.
// NOTE(review): presumably the source of the per-stage colours handed to
// drawGPUChartSVG via stageColorByName — the code that assigns them is not
// visible here; confirm.
var gpuMetricStagePalette = []string{
|
||||
"#d95c5c",
|
||||
"#2185d0",
|
||||
"#21ba45",
|
||||
"#f2c037",
|
||||
"#6435c9",
|
||||
"#00b5ad",
|
||||
"#a5673f",
|
||||
}
|
||||
|
||||
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
var spans []gpuMetricStageSpan
|
||||
for _, row := range rows {
|
||||
name := strings.TrimSpace(row.Stage)
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
start := row.StageStartSec
|
||||
end := row.StageEndSec
|
||||
if end <= start {
|
||||
start = row.ElapsedSec
|
||||
end = row.ElapsedSec
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||
continue
|
||||
}
|
||||
if start < spans[len(spans)-1].Start {
|
||||
spans[len(spans)-1].Start = start
|
||||
}
|
||||
if end > spans[len(spans)-1].End {
|
||||
spans[len(spans)-1].End = end
|
||||
}
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
spans[i].End = spans[i].Start + 1
|
||||
}
|
||||
}
|
||||
return spans
|
||||
}
|
||||
|
||||
// gpuHTMLReplacer rewrites the five characters that are significant in
// HTML/SVG markup as character references. The numeric forms for the quote
// characters match what the standard library's html.EscapeString emits.
var gpuHTMLReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&#34;",
	"'", "&#39;",
)

// gpuHTMLEscape returns s with &, <, >, " and ' replaced by character
// references so the text can be embedded in generated HTML/SVG (for example
// stage names written into <text> elements).
func gpuHTMLEscape(s string) string {
	return gpuHTMLReplacer.Replace(s)
}
|
||||
|
||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||
}
|
||||
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage,elapsed_sec,gpu_index",
|
||||
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.html")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||
}
|
||||
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage-legend",
|
||||
"baseline",
|
||||
"steady-fp16",
|
||||
"GPU Stress Test Metrics",
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("html missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
// mountSource runs `findmnt -n -o SOURCE <target>` and returns its output with
// surrounding whitespace removed, or "" when the command fails.
func mountSource(target string) string {
	query := exec.Command("findmnt", "-n", "-o", "SOURCE", target)
	if raw, err := query.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// mountFSType runs `findmnt -n -o FSTYPE <target>` and returns its output with
// surrounding whitespace removed, or "" when the command fails.
func mountFSType(target string) string {
	query := exec.Command("findmnt", "-n", "-o", "FSTYPE", target)
	if raw, err := query.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// blockDeviceType runs `lsblk -dn -o TYPE <device>` and returns the trimmed
// output. A blank device or a failing command yields "".
func blockDeviceType(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	query := exec.Command("lsblk", "-dn", "-o", "TYPE", device)
	if raw, err := query.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// blockDeviceTransport runs `lsblk -dn -o TRAN <device>` and returns the
// trimmed output. A blank device or a failing command yields "".
func blockDeviceTransport(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	query := exec.Command("lsblk", "-dn", "-o", "TRAN", device)
	if raw, err := query.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// inferLiveBootKind classifies the live boot medium as "ram", "cdrom", "usb",
// "disk" or "unknown". All inputs are trimmed first; the filesystem type,
// device type and transport are compared case-insensitively. Checks run in
// priority order: tmpfs filesystem, "rom" device type, "usb" transport, then
// the /dev/sr and /dev/ source prefixes.
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
	fs := strings.TrimSpace(fsType)
	devType := strings.TrimSpace(deviceType)
	tran := strings.TrimSpace(transport)
	src := strings.TrimSpace(source)

	if strings.EqualFold(fs, "tmpfs") {
		return "ram"
	}
	if strings.EqualFold(devType, "rom") {
		return "cdrom"
	}
	if strings.EqualFold(tran, "usb") {
		return "usb"
	}
	if !strings.HasPrefix(src, "/dev/") {
		return "unknown"
	}
	if strings.HasPrefix(src, "/dev/sr") {
		return "cdrom"
	}
	return "disk"
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
|
||||
@@ -11,50 +11,200 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const installToRAMDir = "/dev/shm/bee-live"
|
||||
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||
return evaluateLiveMediaRAMState(
|
||||
s.LiveBootSource(),
|
||||
toramActive(),
|
||||
globPaths("/run/live/medium/live/*.squashfs"),
|
||||
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||
)
|
||||
}
|
||||
|
||||
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||
state := LiveMediaRAMState{
|
||||
LiveBootSource: status,
|
||||
ToramActive: toram,
|
||||
CopyPresent: len(copiedSquashfs) > 0,
|
||||
}
|
||||
if status.InRAM {
|
||||
state.State = "in_ram"
|
||||
state.Status = "ok"
|
||||
state.CopyComplete = true
|
||||
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||
return state
|
||||
}
|
||||
|
||||
expected := pathBaseSet(sourceSquashfs)
|
||||
copied := pathBaseSet(copiedSquashfs)
|
||||
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||
|
||||
switch {
|
||||
case state.CopyComplete:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||
case state.CopyPresent:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||
case toram:
|
||||
state.State = "toram_failed"
|
||||
state.Status = "failed"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||
default:
|
||||
state.State = "not_in_ram"
|
||||
state.Status = "warning"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
// globPaths returns the paths matching pattern. The error from filepath.Glob
// is deliberately discarded; callers only see whatever match list it returned.
func globPaths(pattern string) []string {
	found, _ := filepath.Glob(pattern)
	return found
}
|
||||
|
||||
// pathBaseSet returns the set of base names (last path element, trimmed of
// surrounding whitespace) of the given paths. Names that trim to "" are
// skipped. The result is never nil.
func pathBaseSet(paths []string) map[string]struct{} {
	names := make(map[string]struct{}, len(paths))
	for i := 0; i < len(paths); i++ {
		name := strings.TrimSpace(filepath.Base(paths[i]))
		if name == "" {
			continue
		}
		names[name] = struct{}{}
	}
	return names
}
|
||||
|
||||
// setContainsAll reports whether every key of want is also a key of have.
// An empty want is never considered satisfied and yields false.
func setContainsAll(have, want map[string]struct{}) bool {
	outstanding := len(want)
	if outstanding == 0 {
		return false
	}
	for key := range want {
		if _, present := have[key]; present {
			outstanding--
		}
	}
	return outstanding == 0
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
state := s.LiveMediaRAMState()
|
||||
if state.InRAM {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
log("Removed incomplete RAM copy.")
|
||||
}()
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -80,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
@@ -87,14 +238,71 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
|
||||
mediumRebound := false
|
||||
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||
} else {
|
||||
mediumRebound = true
|
||||
}
|
||||
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
log("Verifying live medium now served from RAM...")
|
||||
status := s.LiveBootSource()
|
||||
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||
return err
|
||||
}
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
}
|
||||
|
||||
// The live medium mount was not redirected to RAM. This is expected when
|
||||
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||
// because the CD-ROM mount is in use. Check whether files were at least
|
||||
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||
// once the kernel has paged in all actively-used data.
|
||||
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(files) > 0 {
|
||||
if !mediumRebound {
|
||||
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||
}
|
||||
|
||||
func describeLiveBootSource(status LiveBootSource) string {
|
||||
source := strings.TrimSpace(status.Device)
|
||||
if source == "" {
|
||||
source = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if source == "" {
|
||||
source = "unknown source"
|
||||
}
|
||||
switch strings.TrimSpace(status.Kind) {
|
||||
case "ram":
|
||||
return "RAM"
|
||||
case "usb":
|
||||
return "USB (" + source + ")"
|
||||
case "cdrom":
|
||||
return "CD-ROM (" + source + ")"
|
||||
case "disk":
|
||||
return "disk (" + source + ")"
|
||||
default:
|
||||
return source
|
||||
}
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
@@ -112,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
var lastLogged int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
@@ -123,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||
lastLogged = copied
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
@@ -138,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||
if total <= 0 || copied <= 0 {
|
||||
return false
|
||||
}
|
||||
if copied >= total {
|
||||
return copied > lastLogged
|
||||
}
|
||||
if copied < copyProgressLogStep {
|
||||
return false
|
||||
}
|
||||
return copied-lastLogged >= copyProgressLogStep
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
@@ -183,7 +406,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
// loopDeviceOffset returns the byte offset configured for the loop device,
// or -1 if it cannot be determined.
//
// The value is read from the first entry of the "loopdevices" array printed by
// `losetup --json <loopDev>`; a failing command, unparsable JSON or an empty
// array all yield -1.
func loopDeviceOffset(loopDev string) int64 {
	raw, err := exec.Command("losetup", "--json", loopDev).Output()
	if err != nil {
		return -1
	}

	var parsed struct {
		Loopdevices []struct {
			Offset int64 `json:"offset"`
		} `json:"loopdevices"`
	}
	if json.Unmarshal(raw, &parsed) != nil {
		return -1
	}
	if len(parsed.Loopdevices) == 0 {
		return -1
	}
	return parsed.Loopdevices[0].Offset
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||
}
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
// The mount is requested with only the MS_BIND flag and with empty filesystem
// type and data arguments; the error from syscall.Mount is returned unchanged.
|
||||
func bindMount(src, dst string) error {
|
||||
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||
}
|
||||
|
||||
@@ -7,3 +7,7 @@ import "errors"
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
|
||||
// bindMount is a stub that never mounts anything and always returns an error.
// NOTE(review): presumably selected by build constraints on platforms without
// the syscall-based implementation — the constraint is not visible here; confirm.
func bindMount(src, dst string) error {
|
||||
return errors.New("bind mount not available on this platform")
|
||||
}
|
||||
|
||||
126
audit/internal/platform/install_to_ram_test.go
Normal file
126
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dstDir := t.TempDir()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||
}
|
||||
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected verification failure when media is still on USB")
|
||||
}
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||
t.Fatalf("error=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||
t.Fatalf("got %q want RAM", got)
|
||||
}
|
||||
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("in_ram", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||
false,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
false,
|
||||
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||
)
|
||||
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
if state.CopyComplete {
|
||||
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("toram_failed", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
true,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestShouldLogCopyProgress(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
total := int64(250 * 1024 * 1024)
|
||||
step := int64(100 * 1024 * 1024)
|
||||
|
||||
if shouldLogCopyProgress(step-1, total, 0) {
|
||||
t.Fatal("progress logged too early")
|
||||
}
|
||||
if !shouldLogCopyProgress(step, total, 0) {
|
||||
t.Fatal("expected log at first 100MB boundary")
|
||||
}
|
||||
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||
t.Fatal("progress logged again before next 100MB")
|
||||
}
|
||||
if !shouldLogCopyProgress(2*step, total, step) {
|
||||
t.Fatal("expected log at second 100MB boundary")
|
||||
}
|
||||
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||
t.Fatal("expected final completion log")
|
||||
}
|
||||
}
|
||||
83
audit/internal/platform/kill_workers.go
Normal file
83
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against the first argument of
// /proc/<pid>/cmdline to identify bee test worker processes that should be
// killed by KillTestWorkers.
var workerPatterns = []string{
	"bee-gpu-burn",
	"stress-ng",
	"stressapptest",
	"memtester",
	// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
	// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
	"nvvs",
	"dcgmi",
}

// KilledProcess describes a process that was sent SIGKILL.
type KilledProcess struct {
	PID  int    `json:"pid"`
	Name string `json:"name"`
}

// KillTestWorkers scans /proc for running test worker processes and sends
// SIGKILL to each one found. It returns the processes for which the signal
// was actually delivered; processes that could not be signalled (already
// exited, insufficient permission) are skipped silently and not reported.
// The calling process itself is never signalled.
// The scan runs under a 5-second deadline to avoid blocking if the process
// table is very large (e.g. after a stress test with thousands of children);
// on timeout the partial result collected so far is returned.
func KillTestWorkers() []KilledProcess {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	entries, err := os.ReadDir("/proc")
	if err != nil {
		return nil
	}

	self := os.Getpid()
	var killed []KilledProcess
	for _, e := range entries {
		// The deadline is checked once per /proc entry.
		select {
		case <-ctx.Done():
			slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
			return killed
		default:
		}

		if !e.IsDir() {
			continue
		}
		pid, err := strconv.Atoi(e.Name())
		if err != nil || pid == self {
			// Non-numeric entries are not processes; never kill ourselves.
			continue
		}
		cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
		if err != nil {
			continue
		}
		// /proc/*/cmdline uses NUL bytes as argument separators.
		args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
		exe := strings.TrimSpace(args[0])
		base := exe
		if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
			base = exe[idx+1:]
		}
		for _, pat := range workerPatterns {
			// base is a suffix of exe, so matching exe covers both.
			if !strings.Contains(exe, pat) {
				continue
			}
			// Only report the process when the signal was delivered.
			if err := syscall.Kill(pid, syscall.SIGKILL); err == nil {
				killed = append(killed, KilledProcess{PID: pid, Name: base})
			}
			break
		}
	}
	return killed
}
|
||||
@@ -1,8 +1,10 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
@@ -14,13 +16,24 @@ import (
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PowerSource string `json:"power_source,omitempty"`
|
||||
PowerMode string `json:"power_mode,omitempty"`
|
||||
PowerReason string `json:"power_reason,omitempty"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is a per-slot power supply input power reading.
|
||||
type PSUReading struct {
|
||||
Slot int `json:"slot"`
|
||||
Name string `json:"name"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// System power: use the global autotune-selected source when configured,
|
||||
// otherwise fall back to the historical heuristic and mark the mode.
|
||||
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||
s.PowerW = powerW
|
||||
s.PowerSource = decision.EffectiveSource
|
||||
s.PowerMode = decision.Mode
|
||||
s.PowerReason = decision.Reason
|
||||
}
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
@@ -68,18 +90,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
var cpuStatPrev [2]uint64 // [total, idle]
|
||||
|
||||
func sampleCPULoadPct() float64 {
|
||||
total, idle := readCPUStat()
|
||||
if total == 0 {
|
||||
total0, idle0 := readCPUStat()
|
||||
if total0 == 0 {
|
||||
return 0
|
||||
}
|
||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
||||
cpuStatPrev = [2]uint64{total, idle}
|
||||
if prevTotal == 0 {
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
total1, idle1 := readCPUStat()
|
||||
if total1 == 0 {
|
||||
return 0
|
||||
}
|
||||
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||
}
|
||||
|
||||
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
@@ -324,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := collector.PSUSlotsFromSDR(string(out))
|
||||
if len(slots) == 0 {
|
||||
return nil
|
||||
}
|
||||
// Collect slot keys and sort for stable output.
|
||||
keys := make([]int, 0, len(slots))
|
||||
for k := range slots {
|
||||
n, err := strconv.Atoi(k)
|
||||
if err == nil {
|
||||
keys = append(keys, n)
|
||||
}
|
||||
}
|
||||
sort.Ints(keys)
|
||||
psus := make([]PSUReading, 0, len(keys))
|
||||
for _, k := range keys {
|
||||
entry := slots[strconv.Itoa(k)]
|
||||
// Prefer AC input power; fall back to DC output power.
|
||||
var w float64
|
||||
if entry.InputW != nil && *entry.InputW > 0 {
|
||||
w = *entry.InputW
|
||||
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||
w = *entry.OutputW
|
||||
}
|
||||
if w <= 0 {
|
||||
continue
|
||||
}
|
||||
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||
}
|
||||
if len(psus) == 0 {
|
||||
return nil
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPULoadPctBetween(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
prevTotal uint64
|
||||
prevIdle uint64
|
||||
total uint64
|
||||
idle uint64
|
||||
want float64
|
||||
}{
|
||||
{
|
||||
name: "busy half",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 90,
|
||||
want: 50,
|
||||
},
|
||||
{
|
||||
name: "fully busy",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 40,
|
||||
want: 100,
|
||||
},
|
||||
{
|
||||
name: "no progress",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 100,
|
||||
idle: 40,
|
||||
want: 0,
|
||||
},
|
||||
{
|
||||
name: "idle delta larger than total clamps to zero",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 150,
|
||||
want: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
30
audit/internal/platform/nvidia_recover.go
Normal file
30
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||
|
||||
func runNvidiaRecover(args ...string) (string, error) {
|
||||
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||
cmdArgs := []string{
|
||||
"systemd-run",
|
||||
"--quiet",
|
||||
"--pipe",
|
||||
"--wait",
|
||||
"--collect",
|
||||
"--service-type=oneshot",
|
||||
"--unit", unit,
|
||||
}
|
||||
cmdArgs = append(cmdArgs, helperArgs...)
|
||||
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
@@ -16,12 +16,23 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
@@ -38,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -52,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -84,9 +101,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
|
||||
@@ -10,9 +10,11 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -24,7 +26,8 @@ type PlatformStressCycle struct {
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
Cycles []PlatformStressCycle
|
||||
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
@@ -66,8 +69,11 @@ func (s *System) RunPlatformStress(
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
@@ -86,27 +92,31 @@ func (s *System) RunPlatformStress(
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
if hasCPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
}
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
if hasGPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
@@ -151,13 +161,7 @@ func (s *System) RunPlatformStress(
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
@@ -374,10 +378,24 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Start(); err != nil {
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
@@ -385,28 +403,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
return buildAMDGPUStressCmd(ctx, durSec)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
cfg := fmt.Sprintf(`actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
duration: %d`, durSec*1000) + `
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
@@ -416,13 +434,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
@@ -430,13 +455,72 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||
// where the context is cancelled early (user stop, parent timeout).
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
// startLowPriorityCmd starts cmd and then applies the given nice value to
// the new process. A failure to start is returned; a failure to change the
// priority is ignored.
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
	startErr := cmd.Start()
	if startErr != nil {
		return startErr
	}
	if proc := cmd.Process; proc != nil {
		// Best effort: the command keeps running at default priority on error.
		_ = syscall.Setpriority(syscall.PRIO_PROCESS, proc.Pid, nice)
	}
	return nil
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
// containsComponent reports whether name is present in components.
// The comparison is exact (case-sensitive).
func containsComponent(components []string, name string) bool {
	for i := 0; i < len(components); i++ {
		if components[i] != name {
			continue
		}
		return true
	}
	return false
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -27,6 +28,8 @@ var runtimeTrackedServices = []string{
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
@@ -114,6 +117,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
s.collectToRAMHealth(&health)
|
||||
s.collectUSBExportHealth(&health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -135,12 +140,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"dcgmi",
|
||||
"nv-hostengine",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
@@ -155,11 +163,130 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
return tools
|
||||
}
|
||||
|
||||
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
for _, candidate := range candidates {
|
||||
path, err := exec.LookPath(candidate)
|
||||
if err == nil {
|
||||
return ToolStatus{Name: display, Path: path, OK: true}
|
||||
}
|
||||
}
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||
// "failed" = toram was requested but medium is not in RAM.
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
state := s.LiveMediaRAMState()
|
||||
health.ToRAMStatus = state.Status
|
||||
switch state.Status {
|
||||
case "ok":
|
||||
return
|
||||
case "failed":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
case "partial":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_partial",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||
// suitable for log export. Sets USBExportPath to the first match found.
|
||||
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||
health.USBExportPath = findUSBExportMount()
|
||||
}
|
||||
|
||||
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||
// has USB transport. Returns "" if none found.
|
||||
func findUSBExportMount() string {
|
||||
f, err := os.Open("/proc/mounts")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
// fields: device mountpoint fstype options dump pass
|
||||
fields := strings.Fields(scanner.Text())
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||
continue
|
||||
}
|
||||
// Skip read-only mounts
|
||||
opts := strings.Split(options, ",")
|
||||
readOnly := false
|
||||
for _, o := range opts {
|
||||
if strings.TrimSpace(o) == "ro" {
|
||||
readOnly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if readOnly {
|
||||
continue
|
||||
}
|
||||
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||
if !strings.HasPrefix(device, "/dev/") {
|
||||
continue
|
||||
}
|
||||
checkDev := device
|
||||
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||
// Strip trailing partition digits to get the parent disk name.
|
||||
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||
checkDev = trimmed
|
||||
}
|
||||
if blockDeviceTransport(checkDev) == "usb" {
|
||||
return mountPoint
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_stuck",
|
||||
Severity: "critical",
|
||||
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||
})
|
||||
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_disabled",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||
})
|
||||
}
|
||||
}
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
|
||||
@@ -16,14 +16,64 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||
// production logs in _benchmark/_v8/.
|
||||
//
|
||||
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||
// from actual task logs and update the matching constant here.
|
||||
//
|
||||
// Sources:
|
||||
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||
// - SATEstimatedNvidiaGPUValidatePerGPUSec: xFusion v8.6/v8.22 — 77–87 s/GPU
|
||||
// - SATEstimatedNvidiaGPUStressPerGPUSec: xFusion v8.6/v8.22 — 444–448 s/GPU
|
||||
// - SATEstimatedNvidiaTargetedStressPerGPUSec: xFusion v8.6/v8.22 — 347–348 s/GPU (300 s default + overhead)
|
||||
// - SATEstimatedNvidiaTargetedPowerPerGPUSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU
|
||||
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||
const (
	// CPU validate: 60 s of stress-ng plus lscpu/sensors overhead.
	SATEstimatedCPUValidateSec = 65
	// CPU stress: stress-ng for 1800 s (the stress-mode default).
	SATEstimatedCPUStressSec = 1800

	// RAM validate: memtester, 256 MB, one pass.
	SATEstimatedMemoryValidateSec = 70
	// RAM stress: memtester, 512 MB, one pass. Extrapolated from the validate
	// timing on the assumption that duration scales linearly with size.
	SATEstimatedMemoryStressSec = 140

	// NVIDIA dcgmi diag level 2 (medium); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUValidatePerGPUSec = 85
	// NVIDIA dcgmi diag level 3 (targeted stress); per GPU, GPUs run sequentially.
	SATEstimatedNvidiaGPUStressPerGPUSec = 450

	// NVIDIA dcgmi targeted_stress: 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedStressPerGPUSec = 350
	// NVIDIA dcgmi targeted_power: 300 s plus overhead; per GPU, sequential.
	SATEstimatedNvidiaTargetedPowerPerGPUSec = 350

	// NVIDIA dcgmi pulse_test: one figure for all GPUs running at once
	// (not a per-GPU value).
	SATEstimatedNvidiaPulseTestSec = 5000

	// NCCL all_reduce_perf: all GPUs running at once.
	SATEstimatedNvidiaInterconnectSec = 300
	// nvbandwidth: all GPUs running at once. The tool runs every built-in test
	// and exposes no user-configurable time limit, so it sets its own duration.
	SATEstimatedNvidiaBandwidthSec = 2700
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satFreeMemBytes = freeMemBytes
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
@@ -37,6 +87,12 @@ var (
|
||||
"/opt/rocm/bin/rvs",
|
||||
"/opt/rocm-*/bin/rvs",
|
||||
}
|
||||
dcgmProfTesterCandidates = []string{
|
||||
"dcgmproftester",
|
||||
"dcgmproftester13",
|
||||
"dcgmproftester12",
|
||||
"dcgmproftester11",
|
||||
}
|
||||
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||
@@ -75,15 +131,46 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
Name string
|
||||
MemoryMB int
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
MemoryMB int `json:"memory_mb"`
|
||||
}
|
||||
|
||||
// NvidiaGPUStatus is the JSON-serialisable state of one GPU row reported by
// nvidia-smi, as produced by ListNvidiaGPUStatuses.
type NvidiaGPUStatus struct {
	// Index is the GPU index parsed from the first CSV column.
	Index int `json:"index"`
	// Name is the GPU product name (second CSV column).
	Name string `json:"name"`
	// BDF is the PCI bus id after normalizeNvidiaBusID.
	BDF string `json:"bdf,omitempty"`
	// Serial is the board serial number column.
	Serial string `json:"serial,omitempty"`
	// Status is "OK", "RESET_REQUIRED", or "UNKNOWN" for unparseable rows.
	Status string `json:"status"`
	// RawLine is the trimmed nvidia-smi row the entry was built from.
	RawLine string `json:"raw_line,omitempty"`
	// NeedsReset is true when the row contains "GPU requires reset"
	// (matched case-insensitively).
	NeedsReset bool `json:"needs_reset"`
	// ParseFailure marks rows that could not be split or indexed.
	ParseFailure bool `json:"parse_failure,omitempty"`
}
|
||||
|
||||
// nvidiaGPUHealth is one parsed row of the nvidia-smi health query issued by
// readNvidiaGPUHealth.
type nvidiaGPUHealth struct {
	// Index is the GPU index; it stays at the zero value when ParseFailure is set.
	Index int
	// Name is the GPU product name.
	Name string
	// NeedsReset reports that the GPU must be reset before further use.
	NeedsReset bool
	// RawLine is the nvidia-smi row this entry came from.
	RawLine string
	// ParseFailure marks rows that could not be parsed.
	ParseFailure bool
}
|
||||
|
||||
// nvidiaGPUStatusFile accumulates the per-GPU result of an NVIDIA pack run.
// writeNvidiaGPUStatusFiles renders it as gpu-<index>-status.txt.
type nvidiaGPUStatusFile struct {
	// Index is the GPU index the record describes.
	Index int
	// Name is the GPU product name taken from the health query.
	Name string
	// RunStatus is the most severe job status recorded for this GPU.
	RunStatus string
	// Reason is the first output line of the job that set RunStatus.
	Reason string
	// Health is "OK", "RESET_REQUIRED" or "UNKNOWN".
	Health string
	// HealthRaw is the raw nvidia-smi row behind Health.
	HealthRaw string
	// Observed is true when the GPU appeared in the health query.
	Observed bool
	// Selected is true when a job in the pack targeted this GPU.
	Selected bool
	// FailingJob is the name of the job that set RunStatus.
	FailingJob string
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
type AMDGPUInfo struct {
|
||||
Index int
|
||||
Name string
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||
@@ -255,25 +342,213 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
MemoryMB: memMB,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool {
|
||||
return gpus[i].Index < gpus[j].Index
|
||||
})
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var gpus []NvidiaGPUStatus
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 4 {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||
status := "OK"
|
||||
if needsReset {
|
||||
status = "RESET_REQUIRED"
|
||||
}
|
||||
gpus = append(gpus, NvidiaGPUStatus{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||
Serial: strings.TrimSpace(parts[3]),
|
||||
Status: status,
|
||||
RawLine: line,
|
||||
NeedsReset: needsReset,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// normalizeNvidiaBusID lower-cases and trims a PCI bus id and, when it has the
// form domain:bus:device with a domain longer than four characters, keeps only
// the last four characters of the domain (for example "00000000:3B:00.0"
// becomes "0000:3b:00.0"). Any other input is returned lower-cased and trimmed.
func normalizeNvidiaBusID(v string) string {
	id := strings.ToLower(v)
	id = strings.TrimSpace(id)

	segments := strings.Split(id, ":")
	if len(segments) != 3 {
		return id
	}
	domain := segments[0]
	if len(domain) <= 4 {
		return id
	}
	return domain[len(domain)-4:] + ":" + segments[1] + ":" + segments[2]
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
gpuCount := len(selected)
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if len(selected) > 1 {
|
||||
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||
// of CUDA_VISIBLE_DEVICES.
|
||||
stagger := staggerSec
|
||||
if stagger < 0 {
|
||||
stagger = 0
|
||||
}
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(stagger),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-power.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-pulse-test.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-nvbandwidth.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -285,15 +560,90 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-stress.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
if len(gpuIndices) > 0 {
|
||||
return dedupeSortedIndices(gpuIndices), nil
|
||||
}
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func memoryStressSizeArg() string {
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
return fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
availBytes := satFreeMemBytes()
|
||||
if availBytes <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
availMB := availBytes / (1024 * 1024)
|
||||
targetMB := (availMB * 2) / 3
|
||||
if targetMB >= 256 {
|
||||
targetMB = (targetMB / 256) * 256
|
||||
}
|
||||
if targetMB <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
return fmt.Sprintf("%dM", targetMB)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if sizeMB <= 0 {
|
||||
sizeMB = 256
|
||||
}
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||
// intentionally conservative enough for healthy systems while avoiding the
|
||||
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||
timeoutSec := sizeMB*passes*20/100 + 60
|
||||
if timeoutSec < 180 {
|
||||
timeoutSec = 180
|
||||
}
|
||||
if timeoutSec > 900 {
|
||||
timeoutSec = 900
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
@@ -303,11 +653,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
}
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||
sizeArg := memoryStressSizeArg()
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
@@ -349,7 +697,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -381,7 +729,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
@@ -403,11 +751,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
@@ -424,14 +768,24 @@ type satStats struct {
|
||||
Unsupported int
|
||||
}
|
||||
|
||||
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||
out := make([]satJob, 0, len(jobs)+1)
|
||||
out = append(out, satJob{
|
||||
name: "00-nvidia-smi-persistence-mode.log",
|
||||
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||
})
|
||||
out = append(out, jobs...)
|
||||
return out
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
@@ -446,11 +800,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||
args := []string{"dcgmi", "diag", "-r", name}
|
||||
if durationSec > 0 {
|
||||
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
// normalizeNvidiaBurnDuration returns durationSec when it is positive and the
// 300-second default otherwise.
func normalizeNvidiaBurnDuration(durationSec int) int {
	const defaultBurnSec = 300
	if durationSec > 0 {
		return durationSec
	}
	return defaultBurnSec
}
|
||||
|
||||
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
}
|
||||
return []string{
|
||||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -470,11 +852,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||
selectedGPUIndices := map[int]struct{}{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
for _, idx := range job.gpuIndices {
|
||||
selectedGPUIndices[idx] = struct{}{}
|
||||
status := perGPU[idx]
|
||||
if status == nil {
|
||||
status = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = status
|
||||
}
|
||||
status.Selected = true
|
||||
}
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
@@ -483,17 +877,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
var out []byte
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
out = []byte(msg + "\n")
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
}
|
||||
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||
out = append(out, '\n')
|
||||
}
|
||||
out = append(out, []byte(msg+"\n")...)
|
||||
if err == nil {
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return "", ctx.Err()
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||
for _, idx := range job.gpuIndices {
|
||||
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||
}
|
||||
}
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
@@ -502,12 +931,204 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
if nvidiaPack {
|
||||
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
return archive, nil
|
||||
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = status
|
||||
entry.FailingJob = jobName
|
||||
entry.Reason = firstLine(detail)
|
||||
}
|
||||
}
|
||||
|
||||
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err == nil {
|
||||
for _, gpu := range health {
|
||||
entry := perGPU[gpu.Index]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||
perGPU[gpu.Index] = entry
|
||||
}
|
||||
entry.Name = gpu.Name
|
||||
entry.Observed = true
|
||||
entry.HealthRaw = gpu.RawLine
|
||||
if gpu.NeedsReset {
|
||||
entry.Health = "RESET_REQUIRED"
|
||||
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = "FAILED"
|
||||
if strings.TrimSpace(entry.Reason) == "" {
|
||||
entry.Reason = "GPU requires reset"
|
||||
}
|
||||
}
|
||||
} else {
|
||||
entry.Health = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
for idx := range selected {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
entry.Selected = true
|
||||
}
|
||||
var indices []int
|
||||
for idx := range perGPU {
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
sort.Ints(indices)
|
||||
for _, idx := range indices {
|
||||
entry := perGPU[idx]
|
||||
if entry.RunStatus == "" {
|
||||
entry.RunStatus = overall
|
||||
}
|
||||
if entry.Health == "" {
|
||||
entry.Health = "UNKNOWN"
|
||||
}
|
||||
if entry.Name == "" {
|
||||
entry.Name = "Unknown GPU"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||
}
|
||||
if strings.TrimSpace(entry.Reason) != "" {
|
||||
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||
}
|
||||
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// nvidiaSATStatusSeverity ranks a SAT status string so results can be compared.
// Matching ignores case and surrounding whitespace: FAILED is 3, PARTIAL and
// UNSUPPORTED are 2, OK is 1, and anything else (including "") is 0.
func nvidiaSATStatusSeverity(status string) int {
	normalized := strings.TrimSpace(status)
	normalized = strings.ToUpper(normalized)

	if normalized == "FAILED" {
		return 3
	}
	if normalized == "PARTIAL" || normalized == "UNSUPPORTED" {
		return 2
	}
	if normalized == "OK" {
		return 1
	}
	return 0
}
|
||||
|
||||
// firstLine returns the first line of s after surrounding whitespace has been
// removed from s, with the line itself trimmed as well. Blank input yields "".
func firstLine(s string) string {
	trimmed := strings.TrimSpace(s)
	// SplitN always yields at least one element, so indexing [0] is safe even
	// for the empty string.
	head := strings.SplitN(trimmed, "\n", 2)[0]
	return strings.TrimSpace(head)
}
|
||||
|
||||
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||
if job.collectGPU {
|
||||
return true
|
||||
}
|
||||
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||
return strings.Contains(name, "dcgmi") ||
|
||||
strings.Contains(name, "gpu-burn") ||
|
||||
strings.Contains(name, "gpu-stress") ||
|
||||
strings.Contains(name, "dcgmproftester")
|
||||
}
|
||||
|
||||
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
var bad []nvidiaGPUHealth
|
||||
selectedSet := make(map[int]struct{}, len(selected))
|
||||
for _, idx := range selected {
|
||||
selectedSet[idx] = struct{}{}
|
||||
}
|
||||
for _, gpu := range health {
|
||||
if len(selectedSet) > 0 {
|
||||
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if gpu.NeedsReset {
|
||||
bad = append(bad, gpu)
|
||||
}
|
||||
}
|
||||
if len(bad) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
lines := make([]string, 0, len(bad)+1)
|
||||
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||
for _, gpu := range bad {
|
||||
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||
}
|
||||
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||
}
|
||||
|
||||
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
return parseNvidiaGPUHealth(string(out)), nil
|
||||
}
|
||||
|
||||
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||
var gpus []nvidiaGPUHealth
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 2 {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
gpus = append(gpus, nvidiaGPUHealth{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||
RawLine: line,
|
||||
})
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
@@ -531,6 +1152,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
c.Cancel = func() error {
|
||||
if c.Process != nil {
|
||||
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -557,17 +1185,25 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -616,6 +1252,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "not found in path") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
strings.Contains(text, "unknown command") ||
|
||||
strings.Contains(text, "not implemented") ||
|
||||
@@ -625,6 +1262,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||
// while waiting, so the CLI stops polling without proving device failure.
|
||||
(strings.Contains(name, "self-test") &&
|
||||
strings.Contains(text, "no progress for") &&
|
||||
strings.Contains(text, "stop waiting")) ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
@@ -684,7 +1326,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
case "rvs":
|
||||
return resolveRVSCommand(cmd[1:]...)
|
||||
}
|
||||
return cmd, nil
|
||||
path, err := satLookPath(cmd[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||
}
|
||||
return append([]string{path}, cmd[1:]...), nil
|
||||
}
|
||||
|
||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||
@@ -718,6 +1364,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||
for _, candidate := range dcgmProfTesterCandidates {
|
||||
if path, err := satLookPath(candidate); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, errors.New("dcgmproftester not found in PATH")
|
||||
}
|
||||
|
||||
func ensureAMDRuntimeReady() error {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return nil
|
||||
@@ -816,8 +1471,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
||||
if len(metricRows) > 0 {
|
||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||
chart := RenderGPUTerminalChart(metricRows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
||||
}
|
||||
|
||||
return out, err
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -20,7 +21,7 @@ type FanStressOptions struct {
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
@@ -42,13 +43,54 @@ type GPUStressMetric struct {
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64
|
||||
SysPowerSource string
|
||||
SysPowerMode string
|
||||
}
|
||||
|
||||
type cachedPowerReading struct {
|
||||
Value float64
|
||||
Source string
|
||||
Mode string
|
||||
Reason string
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type fanObservationState struct {
|
||||
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||
}
|
||||
|
||||
type fanPeakCandidate struct {
|
||||
FirstSeen time.Time
|
||||
RPM float64
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
fanObservationMu sync.Mutex
|
||||
fanObservation fanObservationState
|
||||
fanObservationInit bool
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||
|
||||
const fanObservationMinPeakHold = time.Second
|
||||
|
||||
// normalizeObservedFanMaxRPM rounds rpm up to the next multiple of 1000.
// Zero and negative inputs yield 0.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	const step = 1000.0
	if rpm <= 0 {
		return 0
	}
	buckets := math.Ceil(rpm / step)
	return buckets * step
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
@@ -211,11 +253,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
@@ -231,9 +269,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
@@ -246,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||
return row
|
||||
}
|
||||
|
||||
@@ -303,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
@@ -316,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
@@ -419,6 +571,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
var samples []float64
|
||||
for _, features := range doc {
|
||||
for name, feature := range features {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
featureMap, ok := feature.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||
samples = append(samples, duty)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||
return 0, false
|
||||
}
|
||||
if strings.Contains(featureName, "pwm") {
|
||||
for _, key := range []string{"input", "value", "current"} {
|
||||
if value, ok := feature[key]; ok {
|
||||
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "pwm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||
continue
|
||||
}
|
||||
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func parseFanDutyValue(value any) (float64, bool) {
|
||||
switch v := value.(type) {
|
||||
case float64:
|
||||
return normalizePWMAsDutyPct(v)
|
||||
case string:
|
||||
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||
return normalizePWMAsDutyPct(f)
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// normalizePWMAsDutyPct interprets a raw PWM reading as a duty percentage.
//
//	raw < 0          -> rejected
//	0 <= raw <= 100  -> taken as a percentage already
//	100 < raw <= 255 -> scaled from the 0-255 range to 0-100
//	raw > 255        -> rejected
//
// Values of 100 or less are never rescaled, so a 0-255 reading in that range
// is returned unchanged.
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		return raw, true
	case raw <= 255:
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
@@ -506,13 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||
// falling back to the historical heuristic before autotune or when degraded.
|
||||
func sampleSystemPowerResolved() (float64, string, string) {
|
||||
now := time.Now()
|
||||
current, decision, err := SampleSystemPowerResolved("")
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
if err != nil {
|
||||
return 0
|
||||
current = 0
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||
systemPowerCache = updated
|
||||
return value, updated.Source, updated.Mode
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -535,6 +803,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
return cache.Value, cache
|
||||
}
|
||||
return 0, cache
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseFanSpeeds(t *testing.T) {
|
||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||
@@ -25,3 +29,108 @@ func TestFirstFanInputValue(t *testing.T) {
|
||||
t.Fatalf("got=%v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"chip0": {
|
||||
"fan1": {"input": 9000},
|
||||
"pwm1": {"input": 128},
|
||||
"pwm1_enable": {"input": 1}
|
||||
},
|
||||
"chip1": {
|
||||
"pwm2": {"input": 64}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||
if !ok {
|
||||
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||
}
|
||||
if got < 57 || got > 58 {
|
||||
t.Fatalf("got=%v want ~57.1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
Minimum during sampling period: 498 Watts
|
||||
`
|
||||
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
if updated.Value != 480 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
if updated.Value != 530 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,23 +1,25 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1")
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda")
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
}
|
||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
if len(jobs) != 6 {
|
||||
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
got := jobs[5].cmd
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -162,6 +183,189 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
got, err := resolveDCGMGPUIndices(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "0,1,2"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "1,3"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len=%d want 2", len(got))
|
||||
}
|
||||
if got[0].NeedsReset {
|
||||
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||
}
|
||||
if !got[1].NeedsReset {
|
||||
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
msg, err := checkNvidiaJobHealth([]int{1})
|
||||
if err == nil {
|
||||
t.Fatal("expected health check error")
|
||||
}
|
||||
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||
t.Fatalf("unexpected message: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||
0: {Index: 0, RunStatus: "OK"},
|
||||
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||
}
|
||||
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
if !strings.Contains(text, "run_status=FAILED") {
|
||||
t.Fatalf("missing run status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||
t.Fatalf("missing health status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||
t.Fatalf("missing failing job:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
switch file {
|
||||
case "dcgmproftester13":
|
||||
return "/usr/bin/dcgmproftester13", nil
|
||||
default:
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 4 {
|
||||
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||
}
|
||||
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||
}
|
||||
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -177,6 +381,37 @@ func TestEnvIntFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "65536M" {
|
||||
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||
|
||||
if got := memoryStressSizeArg(); got != "4096M" {
|
||||
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 0 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "80%" {
|
||||
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifySATResult(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -187,6 +422,7 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
@@ -201,6 +437,38 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
cancel()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||
}, nil)
|
||||
<-done
|
||||
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Fatalf("err=%v want context.Canceled", err)
|
||||
}
|
||||
if archive != "" {
|
||||
t.Fatalf("archive=%q want empty", archive)
|
||||
}
|
||||
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||
if globErr != nil {
|
||||
t.Fatalf("Glob error: %v", globErr)
|
||||
}
|
||||
if len(matches) != 0 {
|
||||
t.Fatalf("archives=%v want none", matches)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -237,6 +505,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
|
||||
@@ -10,17 +10,30 @@ import (
|
||||
func (s *System) ListBeeServices() ([]string, error) {
|
||||
seen := map[string]bool{}
|
||||
var out []string
|
||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
||||
for _, pattern := range []string{
|
||||
"/etc/systemd/system/bee-*.service",
|
||||
"/lib/systemd/system/bee-*.service",
|
||||
"/etc/systemd/system/bee-*.timer",
|
||||
"/lib/systemd/system/bee-*.timer",
|
||||
} {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
base := filepath.Base(match)
|
||||
name := base
|
||||
if strings.HasSuffix(base, ".service") {
|
||||
name = strings.TrimSuffix(base, ".service")
|
||||
}
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
@@ -48,7 +61,12 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
|
||||
@@ -2,6 +2,24 @@ package platform
|
||||
|
||||
// System is a stateless (field-less) handle; presumably the platform operations
// are attached to it as methods — NOTE(review): confirm against the method set.
type System struct{}
|
||||
|
||||
// LiveBootSource is a JSON-serializable record with an in-RAM flag, a kind and
// optional source/device strings.
// NOTE(review): presumably describes what the live system was booted from —
// confirm the exact meaning of Kind/Source/Device against the code that fills it.
type LiveBootSource struct {
	// InRAM is always emitted (no omitempty).
	InRAM bool `json:"in_ram"`
	// Kind is always emitted (no omitempty).
	Kind string `json:"kind"`
	// Source is omitted from JSON when empty.
	Source string `json:"source,omitempty"`
	// Device is omitted from JSON when empty.
	Device string `json:"device,omitempty"`
}
|
||||
|
||||
type LiveMediaRAMState struct {
|
||||
LiveBootSource
|
||||
State string `json:"state"`
|
||||
Status string `json:"status"`
|
||||
ToramActive bool `json:"toram_active,omitempty"`
|
||||
CopyPresent bool `json:"copy_present,omitempty"`
|
||||
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
@@ -37,12 +55,12 @@ type StaticIPv4Config struct {
|
||||
}
|
||||
|
||||
// RemovableTarget is a JSON-serializable record of device, filesystem type,
// size, label, model and mountpoint strings.
// NOTE(review): presumably one removable storage target — confirm against the
// code that populates it.
//
// Every field is declared exactly once and carries an explicit snake_case JSON
// name, so the marshalled object uses keys such as "device" and "fs_type"
// rather than the Go field names. No field uses omitempty, so all six keys are
// always present.
type RemovableTarget struct {
	Device     string `json:"device"`
	FSType     string `json:"fs_type"`
	Size       string `json:"size"`
	Label      string `json:"label"`
	Model      string `json:"model"`
	Mountpoint string `json:"mountpoint"`
}
|
||||
|
||||
type ToolStatus struct {
|
||||
@@ -63,6 +81,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
data, err := json.Marshal(RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "exfat",
|
||||
Size: "1.8T",
|
||||
Label: "USB",
|
||||
Model: "Flash",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
raw := string(data)
|
||||
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||
if !strings.Contains(raw, key) {
|
||||
t.Fatalf("json missing key %s: %s", key, raw)
|
||||
}
|
||||
}
|
||||
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||
t.Fatalf("json still contains Go field names: %s", raw)
|
||||
}
|
||||
}
|
||||
@@ -15,12 +15,17 @@ type HardwareIngestRequest struct {
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
333
audit/internal/webui/api_test.go
Normal file
333
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,333 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-perf" {
|
||||
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
}
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var resp taskRunResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if len(resp.TaskIDs) != 2 {
|
||||
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-power" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
if task.params.RampTotal != 3 {
|
||||
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-autotune" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
|
||||
}
|
||||
if task.params.BenchmarkKind != "power-fit" {
|
||||
t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
got := []int{
|
||||
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
defaultTaskPriority("audit", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
taskPriorityAudit,
|
||||
taskPriorityValidate,
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
992
audit/internal/webui/charts_svg.go
Normal file
992
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,992 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
type chartTimelineSegment struct {
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Active bool
|
||||
}
|
||||
|
||||
// chartScale holds the value range of an axis (Min, Max) and the tick values
// placed along it.
type chartScale struct {
	Min   float64
	Max   float64
	Ticks []float64
}
|
||||
|
||||
// chartLayout holds the overall canvas size and the four edges of the plot
// rectangle inside it, all as integers.
// NOTE(review): units look like SVG user units (pixels) — confirm in writeSVGOpen.
type chartLayout struct {
	// Canvas dimensions.
	Width  int
	Height int

	// Plot rectangle edges.
	PlotLeft   int
	PlotRight  int
	PlotTop    int
	PlotBottom int
}
|
||||
|
||||
// metricChartSeries is one plotted series: its legend name, the title used for
// its axis, its stroke color and the values to plot.
type metricChartSeries struct {
	Name      string
	AxisTitle string
	Color     string
	Values    []float64
}
|
||||
|
||||
// metricChartPalette lists the series colors in assignment order; callers index
// it modulo its length, so series beyond the tenth reuse colors from the start.
var metricChartPalette = []string{
	"#5794f2", "#73bf69", "#f2cc0c", "#ff9830", "#f2495c",
	"#b877d9", "#56d2f7", "#8ab8ff", "#9adf8f", "#ffbe5c",
}
|
||||
|
||||
var gpuLabelCache struct {
|
||||
mu sync.Mutex
|
||||
loadedAt time.Time
|
||||
byIndex map[int]string
|
||||
}
|
||||
|
||||
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
values := make([]float64, pointCount)
|
||||
if i < len(datasets) {
|
||||
copy(values, coalesceDataset(datasets[i], pointCount))
|
||||
}
|
||||
legendItems = append(legendItems, metricChartSeries{
|
||||
Name: name,
|
||||
Color: color,
|
||||
Values: values,
|
||||
})
|
||||
}
|
||||
|
||||
scale := singleAxisChartScale(datasets, yMin, yMax)
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for _, item := range legendItems {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
|
||||
}
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if temp == nil && power == nil && coreClock == nil {
|
||||
return nil, false, nil
|
||||
}
|
||||
labels := sampleTimeLabels(samples)
|
||||
times := sampleTimes(samples)
|
||||
svg, err := drawGPUOverviewChartSVG(
|
||||
gpuDisplayLabel(idx)+" Overview",
|
||||
labels,
|
||||
times,
|
||||
[]metricChartSeries{
|
||||
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||
},
|
||||
timeline,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
return svg, true, nil
|
||||
}
|
||||
|
||||
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
if len(series) != 3 {
|
||||
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||
}
|
||||
const (
|
||||
width = 1400
|
||||
height = 840
|
||||
plotLeft = 180
|
||||
plotRight = 1220
|
||||
plotTop = 96
|
||||
plotBottom = 660
|
||||
)
|
||||
const (
|
||||
leftOuterAxis = 72
|
||||
leftInnerAxis = 132
|
||||
rightInnerAxis = 1268
|
||||
)
|
||||
layout := chartLayout{
|
||||
Width: width,
|
||||
Height: height,
|
||||
PlotLeft: plotLeft,
|
||||
PlotRight: plotRight,
|
||||
PlotTop: plotTop,
|
||||
PlotBottom: plotBottom,
|
||||
}
|
||||
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range series {
|
||||
if len(series[i].Values) == 0 {
|
||||
series[i].Values = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points before building SVG.
|
||||
{
|
||||
datasets := make([][]float64, len(series))
|
||||
for i := range series {
|
||||
datasets[i] = series[i].Values
|
||||
}
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
for i := range series {
|
||||
series[i].Values = datasets[i]
|
||||
}
|
||||
}
|
||||
|
||||
scales := make([]chartScale, len(series))
|
||||
for i := range series {
|
||||
min, max := chartSeriesBounds(series[i].Values)
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
scales[i] = chartScale{
|
||||
Min: ticks[0],
|
||||
Max: ticks[len(ticks)-1],
|
||||
Ticks: ticks,
|
||||
}
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, width, height)
|
||||
writeChartFrame(&b, title, "", width, height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scales[0])
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
|
||||
for i, axisLineX := range axisX {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
|
||||
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
|
||||
for _, tick := range scales[i].Ticks {
|
||||
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
|
||||
label := sanitizeChartText(chartYAxisNumber(tick))
|
||||
if i < 2 {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX+6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX-8, y, series[i].Color, label)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX-6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX+8, y, series[i].Color, label)
|
||||
}
|
||||
}
|
||||
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for i := range series {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
|
||||
}
|
||||
writeLegend(&b, layout, series)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||
if len(samples) == 0 {
|
||||
return nil
|
||||
}
|
||||
times := sampleTimes(samples)
|
||||
start, end := chartTimeBounds(times)
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||
}
|
||||
|
||||
func snapshotTaskHistory() []Task {
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
out := make([]Task, len(globalQueue.tasks))
|
||||
for i, t := range globalQueue.tasks {
|
||||
out[i] = *t
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
type interval struct {
|
||||
start time.Time
|
||||
end time.Time
|
||||
}
|
||||
active := make([]interval, 0, len(tasks))
|
||||
for _, task := range tasks {
|
||||
if task.StartedAt == nil {
|
||||
continue
|
||||
}
|
||||
intervalStart := task.StartedAt.UTC()
|
||||
intervalEnd := now.UTC()
|
||||
if task.DoneAt != nil {
|
||||
intervalEnd = task.DoneAt.UTC()
|
||||
}
|
||||
if !intervalEnd.After(intervalStart) {
|
||||
continue
|
||||
}
|
||||
if intervalEnd.Before(start) || intervalStart.After(end) {
|
||||
continue
|
||||
}
|
||||
if intervalStart.Before(start) {
|
||||
intervalStart = start
|
||||
}
|
||||
if intervalEnd.After(end) {
|
||||
intervalEnd = end
|
||||
}
|
||||
active = append(active, interval{start: intervalStart, end: intervalEnd})
|
||||
}
|
||||
sort.Slice(active, func(i, j int) bool {
|
||||
if active[i].start.Equal(active[j].start) {
|
||||
return active[i].end.Before(active[j].end)
|
||||
}
|
||||
return active[i].start.Before(active[j].start)
|
||||
})
|
||||
merged := make([]interval, 0, len(active))
|
||||
for _, span := range active {
|
||||
if len(merged) == 0 {
|
||||
merged = append(merged, span)
|
||||
continue
|
||||
}
|
||||
last := &merged[len(merged)-1]
|
||||
if !span.start.After(last.end) {
|
||||
if span.end.After(last.end) {
|
||||
last.end = span.end
|
||||
}
|
||||
continue
|
||||
}
|
||||
merged = append(merged, span)
|
||||
}
|
||||
|
||||
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
|
||||
cursor := start
|
||||
for _, span := range merged {
|
||||
if span.start.After(cursor) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
|
||||
}
|
||||
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
|
||||
cursor = span.end
|
||||
}
|
||||
if cursor.Before(end) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
|
||||
}
|
||||
if len(segments) == 0 {
|
||||
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
|
||||
}
|
||||
return segments
|
||||
}
|
||||
|
||||
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||
times := make([]time.Time, 0, len(samples))
|
||||
for _, sample := range samples {
|
||||
times = append(times, sample.Timestamp)
|
||||
}
|
||||
return times
|
||||
}
|
||||
|
||||
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||
min, max := 0.0, 1.0
|
||||
if yMin != nil && yMax != nil {
|
||||
min, max = *yMin, *yMax
|
||||
} else {
|
||||
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||
if yMin != nil {
|
||||
min = *yMin
|
||||
}
|
||||
if yMax != nil {
|
||||
max = *yMax
|
||||
}
|
||||
}
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||
}
|
||||
|
||||
// flattenDatasets concatenates all datasets, in order, into one new slice.
// The result is always non-nil, even when there is nothing to copy.
func flattenDatasets(datasets [][]float64) []float64 {
	size := 0
	for i := range datasets {
		size += len(datasets[i])
	}
	flat := make([]float64, size)
	offset := 0
	for i := range datasets {
		offset += copy(flat[offset:], datasets[i])
	}
	return flat
}
|
||||
|
||||
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||
legendRows := 0
|
||||
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||
cols := 4
|
||||
if seriesCount < cols {
|
||||
cols = seriesCount
|
||||
}
|
||||
legendRows = (seriesCount + cols - 1) / cols
|
||||
}
|
||||
legendHeight := 0
|
||||
if legendRows > 0 {
|
||||
legendHeight = legendRows*24 + 24
|
||||
}
|
||||
return chartLayout{
|
||||
Width: 1400,
|
||||
Height: canvasHeight,
|
||||
PlotLeft: 96,
|
||||
PlotRight: 1352,
|
||||
PlotTop: 72,
|
||||
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||
}
|
||||
}
|
||||
|
||||
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
|
||||
if len(times) == 0 {
|
||||
return time.Time{}, time.Time{}
|
||||
}
|
||||
start := times[0].UTC()
|
||||
end := start
|
||||
for _, ts := range times[1:] {
|
||||
t := ts.UTC()
|
||||
if t.Before(start) {
|
||||
start = t
|
||||
}
|
||||
if t.After(end) {
|
||||
end = t
|
||||
}
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
if count <= 0 {
|
||||
return nil
|
||||
}
|
||||
if len(times) == count {
|
||||
return times
|
||||
}
|
||||
if len(times) == 1 {
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = times[0].Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = base.Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||
// power charts where the filled area of each PSU shows its individual
|
||||
// contribution and the total height equals the combined draw.
|
||||
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
// Build cumulative sums per time point.
|
||||
cumulative := make([][]float64, len(datasets)+1)
|
||||
for i := range cumulative {
|
||||
cumulative[i] = make([]float64, pointCount)
|
||||
}
|
||||
for i, ds := range datasets {
|
||||
for j, v := range ds {
|
||||
cumulative[i+1][j] = cumulative[i][j] + v
|
||||
}
|
||||
}
|
||||
|
||||
// Scale is based on the total (top cumulative row).
|
||||
total := cumulative[len(cumulative)-1]
|
||||
yMin := floatPtr(0)
|
||||
if yMax == nil {
|
||||
yMax = autoMax120(total)
|
||||
}
|
||||
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||
|
||||
legendItems := make([]metricChartSeries, len(datasets))
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||
}
|
||||
|
||||
// Stats label from totals.
|
||||
statsLabel := chartStatsLabel([][]float64{total})
|
||||
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
|
||||
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
// Draw border polylines on top.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||
// (baseline and top), using the given color at 55% opacity.
|
||||
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||
n := len(top)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
if len(baseline) < n {
|
||||
baseline = make([]float64, n)
|
||||
}
|
||||
|
||||
// Forward path along top values, then backward along baseline values.
|
||||
var points strings.Builder
|
||||
for i := 0; i < n; i++ {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
if i > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteByte(' ')
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||
}
|
||||
|
||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||
}
|
||||
|
||||
func writeSVGClose(b *strings.Builder) {
|
||||
b.WriteString("</svg>\n")
|
||||
}
|
||||
|
||||
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(title))
|
||||
if strings.TrimSpace(subtitle) != "" {
|
||||
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(subtitle))
|
||||
}
|
||||
}
|
||||
|
||||
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
|
||||
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotRight, y)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||
if pointCount <= 0 {
|
||||
return
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
ts := chartPointTime(times, idx)
|
||||
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||
}
|
||||
}
|
||||
|
||||
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
label := ""
|
||||
if idx < len(labels) {
|
||||
label = labels[idx]
|
||||
}
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||
}
|
||||
|
||||
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||
if len(values) == 0 {
|
||||
return
|
||||
}
|
||||
var points strings.Builder
|
||||
for idx, value := range values {
|
||||
if idx > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||
points.String(), color)
|
||||
if len(values) == 1 {
|
||||
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||
return
|
||||
}
|
||||
peakIdx := 0
|
||||
peakValue := values[0]
|
||||
for idx, value := range values[1:] {
|
||||
if value >= peakValue {
|
||||
peakIdx = idx + 1
|
||||
peakValue = value
|
||||
}
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||
}
|
||||
|
||||
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||
return
|
||||
}
|
||||
cols := 4
|
||||
if len(series) < cols {
|
||||
cols = len(series)
|
||||
}
|
||||
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||
baseY := layout.PlotBottom + 74
|
||||
for i, item := range series {
|
||||
row := i / cols
|
||||
col := i % cols
|
||||
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||
y := float64(baseY + row*24)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||
x, y, x+28, y, item.Color)
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||
x+38, y+4, sanitizeChartText(item.Name))
|
||||
}
|
||||
}
|
||||
|
||||
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||
for _, segment := range segments {
|
||||
if segment.Active || !segment.End.After(segment.Start) {
|
||||
continue
|
||||
}
|
||||
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
seen := map[int]bool{}
|
||||
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||
for i, segment := range segments {
|
||||
if i > 0 {
|
||||
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
if i < len(segments)-1 {
|
||||
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
// downsampleTimeSeries reduces the time series to at most maxPts points using
|
||||
// min-max bucketing. Each bucket contributes the index of its min and max value
|
||||
// (using the first full-length dataset as the reference series). All parallel
|
||||
// datasets are sampled at those same indices so all series stay aligned.
|
||||
// If len(times) <= maxPts the inputs are returned unchanged.
|
||||
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
|
||||
n := len(times)
|
||||
if n <= maxPts || maxPts <= 0 {
|
||||
return times, datasets
|
||||
}
|
||||
buckets := maxPts / 2
|
||||
if buckets < 1 {
|
||||
buckets = 1
|
||||
}
|
||||
// Use the first dataset that has the same length as times as the reference
|
||||
// for deciding which two indices to keep per bucket.
|
||||
var ref []float64
|
||||
for _, ds := range datasets {
|
||||
if len(ds) == n {
|
||||
ref = ds
|
||||
break
|
||||
}
|
||||
}
|
||||
selected := make([]int, 0, maxPts)
|
||||
bucketSize := float64(n) / float64(buckets)
|
||||
for b := 0; b < buckets; b++ {
|
||||
lo := int(math.Round(float64(b) * bucketSize))
|
||||
hi := int(math.Round(float64(b+1) * bucketSize))
|
||||
if hi > n {
|
||||
hi = n
|
||||
}
|
||||
if lo >= hi {
|
||||
continue
|
||||
}
|
||||
if ref == nil {
|
||||
selected = append(selected, lo)
|
||||
if hi-1 != lo {
|
||||
selected = append(selected, hi-1)
|
||||
}
|
||||
continue
|
||||
}
|
||||
minIdx, maxIdx := lo, lo
|
||||
for i := lo + 1; i < hi; i++ {
|
||||
if ref[i] < ref[minIdx] {
|
||||
minIdx = i
|
||||
}
|
||||
if ref[i] > ref[maxIdx] {
|
||||
maxIdx = i
|
||||
}
|
||||
}
|
||||
if minIdx <= maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
if maxIdx != minIdx {
|
||||
selected = append(selected, maxIdx)
|
||||
}
|
||||
} else {
|
||||
selected = append(selected, maxIdx)
|
||||
if minIdx != maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
}
|
||||
}
|
||||
}
|
||||
outTimes := make([]time.Time, len(selected))
|
||||
for i, idx := range selected {
|
||||
outTimes[i] = times[idx]
|
||||
}
|
||||
outDatasets := make([][]float64, len(datasets))
|
||||
for d, ds := range datasets {
|
||||
if len(ds) != n {
|
||||
outDatasets[d] = ds
|
||||
continue
|
||||
}
|
||||
out := make([]float64, len(selected))
|
||||
for i, idx := range selected {
|
||||
out[i] = ds[idx]
|
||||
}
|
||||
outDatasets[d] = out
|
||||
}
|
||||
return outTimes, outDatasets
|
||||
}
|
||||
|
||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||
if !end.After(start) {
|
||||
return float64(left+right) / 2
|
||||
}
|
||||
if ts.Before(start) {
|
||||
ts = start
|
||||
}
|
||||
if ts.After(end) {
|
||||
ts = end
|
||||
}
|
||||
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
|
||||
return float64(left) + ratio*float64(right-left)
|
||||
}
|
||||
|
||||
func chartPointTime(times []time.Time, idx int) time.Time {
|
||||
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
|
||||
return times[idx].UTC()
|
||||
}
|
||||
if len(times) > 0 && !times[0].IsZero() {
|
||||
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
|
||||
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||
if scale.Max <= scale.Min {
|
||||
return float64(plotTop+plotBottom) / 2
|
||||
}
|
||||
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||
}
|
||||
|
||||
func chartSeriesBounds(values []float64) (float64, float64) {
|
||||
if len(values) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
min, max := values[0], values[0]
|
||||
for _, value := range values[1:] {
|
||||
if value < min {
|
||||
min = value
|
||||
}
|
||||
if value > max {
|
||||
max = value
|
||||
}
|
||||
}
|
||||
if min == max {
|
||||
if max == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
pad := math.Abs(max) * 0.1
|
||||
if pad == 0 {
|
||||
pad = 1
|
||||
}
|
||||
min -= pad
|
||||
max += pad
|
||||
}
|
||||
if min > 0 {
|
||||
pad := (max - min) * 0.2
|
||||
if pad == 0 {
|
||||
pad = max * 0.1
|
||||
}
|
||||
min -= pad
|
||||
if min < 0 {
|
||||
min = 0
|
||||
}
|
||||
max += pad
|
||||
}
|
||||
return min, max
|
||||
}
|
||||
|
||||
func chartNiceTicks(min, max float64, target int) []float64 {
|
||||
if min == max {
|
||||
max = min + 1
|
||||
}
|
||||
span := max - min
|
||||
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
|
||||
for _, factor := range []float64{1, 2, 5, 10} {
|
||||
if span/(factor*step) <= float64(target)*1.5 {
|
||||
step = factor * step
|
||||
break
|
||||
}
|
||||
}
|
||||
low := math.Floor(min/step) * step
|
||||
high := math.Ceil(max/step) * step
|
||||
var ticks []float64
|
||||
for value := low; value <= high+step*0.001; value += step {
|
||||
ticks = append(ticks, math.Round(value*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func valueClamp(value float64, scale chartScale) float64 {
|
||||
if value < scale.Min {
|
||||
return scale.Min
|
||||
}
|
||||
if value > scale.Max {
|
||||
return scale.Max
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
func chartStatsLabel(datasets [][]float64) string {
|
||||
mn, avg, mx := globalStats(datasets)
|
||||
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf("min %s avg %s max %s",
|
||||
chartLegendNumber(mn),
|
||||
chartLegendNumber(avg),
|
||||
chartLegendNumber(mx),
|
||||
)
|
||||
}
|
||||
|
||||
func gpuDisplayLabel(idx int) string {
|
||||
if name := gpuModelNameByIndex(idx); name != "" {
|
||||
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||
}
|
||||
return fmt.Sprintf("GPU %d", idx)
|
||||
}
|
||||
|
||||
func gpuModelNameByIndex(idx int) string {
|
||||
now := time.Now()
|
||||
gpuLabelCache.mu.Lock()
|
||||
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||
gpuLabelCache.loadedAt = now
|
||||
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||
}
|
||||
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||
gpuLabelCache.mu.Unlock()
|
||||
return name
|
||||
}
|
||||
|
||||
func loadGPUModelNames() map[int]string {
|
||||
out := map[int]string{}
|
||||
gpus, err := platform.New().ListNvidiaGPUs()
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
for _, gpu := range gpus {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name != "" {
|
||||
out[gpu.Index] = name
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -1,6 +1,9 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -9,13 +12,33 @@ import (
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
|
||||
type jobState struct {
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
// readTaskLogFile reads a task log, refusing files over 50 MB.
|
||||
func readTaskLogFile(path string) ([]byte, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if int64(len(data)) > 50<<20 {
|
||||
return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
@@ -34,7 +57,10 @@ func (j *jobState) append(line string) {
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
@@ -44,6 +70,35 @@ func (j *jobState) append(line string) {
|
||||
}
|
||||
}
|
||||
|
||||
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||
func (j *jobState) writeLogLineLocked(line string) {
|
||||
if j.logFile == nil {
|
||||
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
j.logFile = f
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
func (j *jobState) closeLog() {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.logBuf != nil {
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
if j.logFile != nil {
|
||||
_ = j.logFile.Close()
|
||||
j.logFile = nil
|
||||
j.logBuf = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
@@ -84,12 +139,12 @@ func (m *jobManager) create(id string) *jobState {
|
||||
j := &jobState{}
|
||||
m.jobs[id] = j
|
||||
// Schedule cleanup after 30 minutes
|
||||
go func() {
|
||||
goRecoverOnce("job cleanup", func() {
|
||||
time.Sleep(30 * time.Minute)
|
||||
m.mu.Lock()
|
||||
delete(m.jobs, id)
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
})
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -107,12 +162,15 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string) *jobState {
|
||||
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if len(serialPrefix) > 0 {
|
||||
j.serialPrefix = serialPrefix[0]
|
||||
}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := os.ReadFile(logPath)
|
||||
data, err := readTaskLogFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
|
||||
242
audit/internal/webui/kmsg_watcher.go
Normal file
242
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,242 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
// It supports multiple concurrent SAT tasks: a shared event window is open
// while any SAT task is running, and flushed when all tasks complete.
type kmsgWatcher struct {
	mu          sync.Mutex              // guards activeCount and window
	activeCount int                     // number of in-flight SAT tasks
	window      *kmsgWindow             // current event window; nil while no task is running
	statusDB    *app.ComponentStatusDB  // destination for flushed events; flushing is skipped when nil
}
|
||||
|
||||
// kmsgWindow collects the kernel error events observed while at least one
// SAT task is running.
type kmsgWindow struct {
	targets   []string              // SAT targets running concurrently
	startedAt time.Time             // set when the window is opened by the first task
	seen      map[kmsgEventKey]bool // (id, category) pairs already recorded, used for deduplication
	events    []kmsgEvent           // recorded events, in arrival order
}
|
||||
|
||||
// kmsgEventKey identifies an event for deduplication inside a window.
type kmsgEventKey struct {
	id       string // BDF or device name
	category string // category of the pattern that matched the kernel message
}
|
||||
|
||||
// kmsgEvent is one kernel log message that matched a hardware error pattern.
type kmsgEvent struct {
	timestamp time.Time // wall-clock time at which the line was parsed
	raw       string    // message text of the kmsg line
	ids       []string  // BDF addresses or device names extracted
	category  string    // category of the pattern that matched
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
// It hands w.run to goRecoverLoop together with the label "kmsg watcher" and
// a 5-second duration.
// NOTE(review): presumably goRecoverLoop runs w.run in its own goroutine and
// restarts it after a panic using that delay — confirm against the helper.
func (w *kmsgWatcher) start() {
	goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
for {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
time.Sleep(30 * time.Second)
|
||||
continue
|
||||
}
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
_ = f.Close()
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||
// if this is the first task starting.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.activeCount == 0 {
|
||||
w.window = &kmsgWindow{
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
w.activeCount++
|
||||
if w.window != nil {
|
||||
w.window.targets = append(w.window.targets, target)
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||
// it flushes the accumulated events to the status DB.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
w.activeCount--
|
||||
var window *kmsgWindow
|
||||
if w.activeCount <= 0 {
|
||||
w.activeCount = 0
|
||||
window = w.window
|
||||
w.window = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF canonicalises a PCIe BDF: surrounding whitespace is removed,
// the text is lower-cased, and a short "bus:dev.fn" form (exactly one ':')
// gets the default domain prefixed, yielding e.g. "0000:c8:00.0".
func normalizeBDF(bdf string) string {
	canonical := strings.TrimSpace(bdf)
	canonical = strings.ToLower(canonical)
	if strings.Count(canonical, ":") != 1 {
		return canonical
	}
	return "0000:" + canonical
}
|
||||
|
||||
// truncate shortens s to at most max bytes and appends "..." when anything
// was cut off. Strings that already fit are returned unchanged.
//
// The cut never lands inside a multi-byte UTF-8 sequence: if byte offset max
// falls on a continuation byte, the cut moves back to the start of that rune,
// so the result stays valid UTF-8. A negative max is treated as 0.
func truncate(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}
	cut := max
	// A UTF-8 rune has at most 3 continuation bytes (10xxxxxx); step back
	// over them so the slice ends on a rune boundary.
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut] + "..."
}
|
||||
|
||||
// isSATTarget reports whether target names a task that runs a hardware
// acceptance test. Matching is exact and case-sensitive.
func isSATTarget(target string) bool {
	satTargets := [...]string{
		// NVIDIA
		"nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power",
		"nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
		"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress",
		// Memory, storage, CPU
		"memory", "memory-stress", "storage", "cpu", "sat-stress",
		// AMD
		"amd", "amd-mem", "amd-bandwidth", "amd-stress",
		// Whole platform
		"platform-stress",
	}
	for _, known := range satTargets {
		if target == known {
			return true
		}
	}
	return false
}
|
||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// layoutHead returns the opening part of every page: the doctype, the <head>
// element (charset, viewport, title and the shared inline stylesheet) and the
// opening <body> tag. title is HTML-escaped before being placed in <title>.
// The returned markup leaves <html> and <body> open for the caller to close.
func layoutHead(title string) string {
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>` + html.EscapeString(title) + `</title>
<style>
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
*{box-sizing:border-box;margin:0;padding:0}
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
a{color:var(--accent);text-decoration:none}
/* Sidebar */
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
.nav{flex:1}
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
/* Content */
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
.content{padding:24px;flex:1}
/* Cards */
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
.card-head-actions{justify-content:space-between}
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
.card-body{padding:16px}
/* Buttons */
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
.btn-sm{padding:5px 10px;font-size:12px}
/* Tables */
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
tr:first-child td{border-top:0}
tbody tr:hover td{background:rgba(0,0,0,.03)}
/* Status badges */
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Component chips — one small square per device */
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
/* Output terminal */
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
/* Forms */
.form-row{margin-bottom:14px}
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
/* Grid */
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
/* iframe viewer */
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
/* Alerts */
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
</style>
</head>
<body>
`
}
|
||||
|
||||
func layoutNav(active string, buildLabel string) string {
|
||||
items := []struct{ id, label, href, onclick string }{
|
||||
{"dashboard", "Dashboard", "/", ""},
|
||||
{"audit", "Audit", "/audit", ""},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||
if strings.TrimSpace(buildLabel) == "" {
|
||||
buildLabel = "dev"
|
||||
}
|
||||
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||
gspMode := strings.TrimSpace(string(raw))
|
||||
switch gspMode {
|
||||
case "gsp-off":
|
||||
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
|
||||
case "gsp-stuck":
|
||||
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
|
||||
}
|
||||
}
|
||||
b.WriteString(`<nav class="nav">`)
|
||||
for _, item := range items {
|
||||
cls := "nav-item"
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
if item.onclick != "" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||
cls, item.href, item.onclick, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
}
|
||||
@@ -4,7 +4,11 @@ import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -18,8 +22,18 @@ type MetricsDB struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
func (m *MetricsDB) Close() error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
return m.db.Close()
|
||||
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -39,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
power_source TEXT,
|
||||
power_mode TEXT,
|
||||
power_reason TEXT,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
@@ -48,6 +65,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
clock_mhz REAL,
|
||||
mem_clock_mhz REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||
@@ -64,6 +83,47 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||
}
|
||||
|
||||
// ensureMetricsColumn makes sure that table has a column named column,
// adding it with the given SQL type definition when it is missing. It is used
// to upgrade databases created before the column existed.
//
// The existing columns are read with PRAGMA table_info and compared
// case-insensitively. table, column and definition are concatenated into the
// SQL text unescaped, so they must be trusted constants, never user input.
//
// Returns nil when the column already exists or was added, otherwise the
// first query, scan or ALTER TABLE error.
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
	rows, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		// table_info yields: cid, name, type, notnull, dflt_value, pk.
		var cid int
		var name, ctype string
		var notNull, pk int
		var dflt sql.NullString
		if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
			return err
		}
		if strings.EqualFold(name, column) {
			return nil
		}
	}
	if err := rows.Err(); err != nil {
		return err
	}
	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
	return err
}
|
||||
|
||||
@@ -77,16 +137,16 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -113,14 +173,81 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
// affected — rows newer than downsampleBefore keep full 5-second resolution.
// For each 60-second bucket the row with the smallest ts is kept; the rest
// are deleted. This trims ~92 % of rows in that window while preserving
// the overall shape of every chart.
//
// Called hourly by the metrics collector background goroutine.
//
// It is a no-op on a nil receiver, when no database is open, or when the
// window is empty (downsampleBefore not after deleteOlderThan, compared at
// whole-second precision). The four metrics tables are processed one after
// another; the first failing DELETE aborts and its error is returned.
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
	if m == nil || m.db == nil {
		return nil
	}
	// Timestamps are compared as Unix seconds.
	start := deleteOlderThan.Unix()
	end := downsampleBefore.Unix()
	if end <= start {
		return nil
	}
	// For each table: delete rows in [start, end) whose ts is NOT the minimum
	// ts in its 60-second bucket (ts/60 integer division = bucket ID).
	for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
		// The window bounds are bound twice: once for the outer DELETE and
		// once for the sub-select that picks the surviving rows.
		_, err := m.db.Exec(`
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
AND ts NOT IN (
SELECT MIN(ts) FROM `+table+`
WHERE ts >= ? AND ts < ?
GROUP BY ts / 60
)`, start, end, start, end)
		if err != nil {
			return err
		}
	}
	return nil
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
@@ -132,13 +259,16 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
defer rows.Close()
|
||||
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
powerSource string
|
||||
powerMode string
|
||||
powerReason string
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
@@ -146,20 +276,18 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Reverse to chronological order
|
||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||
}
|
||||
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
|
||||
// Load GPU rows in range
|
||||
type gpuKey struct{ ts int64; idx int }
|
||||
type gpuKey struct {
|
||||
ts int64
|
||||
idx int
|
||||
}
|
||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||
gRows, err := m.db.Query(
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
@@ -167,14 +295,17 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
for gRows.Next() {
|
||||
var ts int64
|
||||
var g platform.GPUMetricRow
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load fan rows in range
|
||||
type fanKey struct{ ts int64; name string }
|
||||
type fanKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
fanData := map[fanKey]float64{}
|
||||
fRows, err := m.db.Query(
|
||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -192,7 +323,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
|
||||
// Load temp rows in range
|
||||
type tempKey struct{ ts int64; name string }
|
||||
type tempKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
tempData := map[tempKey]platform.TempReading{}
|
||||
tRows, err := m.db.Query(
|
||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -208,7 +342,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
@@ -217,6 +353,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
@@ -225,6 +363,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
@@ -233,14 +373,18 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
PowerSource: r.powerSource,
|
||||
PowerMode: r.powerMode,
|
||||
PowerReason: r.powerReason,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
@@ -266,7 +410,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||
g.clock_mhz, g.mem_clock_mhz
|
||||
FROM sys_metrics s
|
||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||
ORDER BY s.ts, g.gpu_index
|
||||
@@ -277,13 +422,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
defer rows.Close()
|
||||
|
||||
cw := csv.NewWriter(w)
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||
for rows.Next() {
|
||||
var ts int64
|
||||
var cpu, mem, pwr float64
|
||||
var gpuIdx sql.NullInt64
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||
continue
|
||||
}
|
||||
row := []string{
|
||||
@@ -299,9 +444,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||
)
|
||||
} else {
|
||||
row = append(row, "", "", "", "", "")
|
||||
row = append(row, "", "", "", "", "", "", "")
|
||||
}
|
||||
_ = cw.Write(row)
|
||||
}
|
||||
@@ -309,9 +456,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
return cw.Error()
|
||||
}
|
||||
|
||||
// Close closes the database.
|
||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||
|
||||
func nullFloat(v float64) sql.NullFloat64 {
|
||||
return sql.NullFloat64{Float64: v, Valid: true}
|
||||
}
|
||||
|
||||
174
audit/internal/webui/metricsdb_test.go
Normal file
174
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||
raw, err := sql.Open("sqlite", path)
|
||||
if err != nil {
|
||||
t.Fatalf("sql.Open: %v", err)
|
||||
}
|
||||
_, err = raw.Exec(`
|
||||
CREATE TABLE gpu_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
gpu_index INTEGER NOT NULL,
|
||||
temp_c REAL,
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE sys_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE fan_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
rpm REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
CREATE TABLE temp_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
grp TEXT NOT NULL,
|
||||
celsius REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatalf("create legacy schema: %v", err)
|
||||
}
|
||||
_ = raw.Close()
|
||||
|
||||
db, err := openMetricsDB(path)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
now := time.Unix(1_700_000_100, 0).UTC()
|
||||
err = db.Write(platform.LiveMetricSample{
|
||||
Timestamp: now,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
|
||||
samples, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||
t.Fatalf("samples=%+v", samples)
|
||||
}
|
||||
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||
}
|
||||
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 5; i++ {
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||
CPULoadPct: float64(i),
|
||||
}); err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("LoadBetween: %v", err)
|
||||
}
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||
}
|
||||
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||
}
|
||||
}
|
||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// benchmarkHistoryRun is one saved benchmark result reduced to what the
// results table shows: when it was generated, its overall status, and a
// composite score and status per GPU index.
type benchmarkHistoryRun struct {
	// generatedAt is the result's generation time; runs are sorted on it.
	generatedAt time.Time
	// displayTime is generatedAt pre-formatted for the "Time" column.
	displayTime string
	// gpuScores maps GPU index to the score shown in that GPU's column.
	gpuScores map[int]float64
	// gpuStatuses maps GPU index to the status string used to colour the score.
	gpuStatuses map[int]string
	// overallStatus is the run-level status; empty is rendered as "OK".
	overallStatus string
}
|
||||
|
||||
func renderBenchmark(opts HandlerOptions) string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="grid2">
|
||||
<div class="card">
|
||||
<div class="card-head">Benchmark Setup</div>
|
||||
<div class="card-body">
|
||||
<div class="form-row">
|
||||
<label>Profile</label>
|
||||
<select id="benchmark-profile">
|
||||
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-row">
|
||||
<label>GPU Selection</label>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Sequential — one GPU at a time</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Parallel — all selected GPUs simultaneously</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||
</label>
|
||||
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||
</div>
|
||||
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Method Split</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||
<table>
|
||||
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||
</table>
|
||||
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||
|
||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let benchmarkES = null;
|
||||
function benchmarkTaskIDs(payload) {
|
||||
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||
if (payload && payload.task_id) return [payload.task_id];
|
||||
return [];
|
||||
}
|
||||
function benchmarkSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function benchmarkMode() {
|
||||
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||
return el ? el.value : 'sequential';
|
||||
}
|
||||
function benchmarkUpdateSelectionNote() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||
const note = document.getElementById('benchmark-selection-note');
|
||||
if (!selected.length) {
|
||||
perfBtn.disabled = true;
|
||||
fitBtn.disabled = true;
|
||||
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||
return;
|
||||
}
|
||||
perfBtn.disabled = false;
|
||||
fitBtn.disabled = false;
|
||||
const mode = benchmarkMode();
|
||||
if (mode === 'ramp-up') {
|
||||
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||
} else if (mode === 'parallel') {
|
||||
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||
} else {
|
||||
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||
}
|
||||
}
|
||||
function benchmarkRenderGPUList(gpus) {
|
||||
const root = document.getElementById('benchmark-gpu-list');
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="benchmark-gpu-row">'
|
||||
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
benchmarkApplyMultiGPUState(gpus.length);
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||
var multiValues = ['parallel', 'ramp-up'];
|
||||
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||
radios.forEach(function(el) {
|
||||
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||
if (gpuCount < 2 && isMulti) {
|
||||
el.disabled = true;
|
||||
if (el.checked) {
|
||||
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||
if (seq) seq.checked = true;
|
||||
}
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '0.4';
|
||||
} else {
|
||||
el.disabled = false;
|
||||
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||
var label = el.closest('label');
|
||||
if (label) label.style.opacity = '';
|
||||
}
|
||||
});
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkLoadGPUs() {
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
status.textContent = '';
|
||||
fetch('/api/gpu/nvidia').then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
}).then(function(gpus) {
|
||||
benchmarkRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
benchmarkUpdateSelectionNote();
|
||||
});
|
||||
}
|
||||
function benchmarkSelectAll() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function benchmarkSelectNone() {
|
||||
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||
benchmarkUpdateSelectionNote();
|
||||
}
|
||||
function runNvidiaBenchmark(kind) {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
if (!selected.length) {
|
||||
status.textContent = 'Select at least one GPU.';
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const mode = benchmarkMode();
|
||||
const rampUp = mode === 'ramp-up' && selected.length > 1;
|
||||
const parallelGPUs = mode === 'parallel' && kind === 'performance';
|
||||
if (kind === 'power-fit' && mode === 'parallel') {
|
||||
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
|
||||
return;
|
||||
}
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: kind === 'performance' && selected.length > 1,
|
||||
parallel_gpus: parallelGPUs,
|
||||
ramp_up: rampUp,
|
||||
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
|
||||
status.textContent = 'Queueing...';
|
||||
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
|
||||
fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(body)
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||
const streamNext = function(idx, failures) {
|
||||
if (idx >= taskIds.length) {
|
||||
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||
return;
|
||||
}
|
||||
const taskId = taskIds[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
if (e.data) failures += 1;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
const isLast = (idx + 1 >= taskIds.length);
|
||||
streamNext(idx + 1, failures);
|
||||
if (isLast) { benchmarkRefreshResults(); }
|
||||
});
|
||||
benchmarkES.onerror = function() {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
streamNext(idx + 1, failures + 1);
|
||||
};
|
||||
};
|
||||
streamNext(0, 0);
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
function benchmarkRenderAutotuneStatus(payload) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (!el) return;
|
||||
if (!payload || !payload.configured || !payload.config) {
|
||||
el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
|
||||
return;
|
||||
}
|
||||
const cfg = payload.config || {};
|
||||
const decision = payload.decision || {};
|
||||
const updated = cfg.updated_at ? new Date(cfg.updated_at).toLocaleString() : 'unknown time';
|
||||
const confidence = typeof cfg.confidence === 'number' ? (' · confidence ' + Math.round(cfg.confidence * 100) + '%') : '';
|
||||
const effective = decision.effective_source ? (' · effective ' + decision.effective_source) : '';
|
||||
const mode = decision.mode ? (' · mode ' + decision.mode) : '';
|
||||
el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
|
||||
}
|
||||
function loadBenchmarkAutotuneStatus() {
|
||||
fetch('/api/bee-bench/nvidia/autotune/status')
|
||||
.then(function(r) {
|
||||
return r.json().then(function(body) {
|
||||
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||
return body;
|
||||
});
|
||||
})
|
||||
.then(function(body) { benchmarkRenderAutotuneStatus(body); })
|
||||
.catch(function(err) {
|
||||
const el = document.getElementById('benchmark-autotune-status');
|
||||
if (el) el.textContent = 'Autotune status error: ' + err.message;
|
||||
});
|
||||
}
|
||||
function runBenchmarkAutotune() {
|
||||
const selected = benchmarkSelectedGPUIndices();
|
||||
const status = document.getElementById('benchmark-run-status');
|
||||
const term = document.getElementById('benchmark-terminal');
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
|
||||
term.textContent = 'Enqueuing benchmark autotune...\n';
|
||||
status.textContent = 'Queueing autotune...';
|
||||
fetch('/api/bee-bench/nvidia/autotune/run', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify({
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
|
||||
gpu_indices: selected
|
||||
})
|
||||
}).then(function(r) {
|
||||
return r.json().then(function(payload) {
|
||||
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||
return payload;
|
||||
});
|
||||
}).then(function(d) {
|
||||
const taskIds = benchmarkTaskIDs(d);
|
||||
if (!taskIds.length) throw new Error('No autotune task was queued.');
|
||||
const taskId = taskIds[0];
|
||||
status.textContent = 'Autotune queued: ' + taskId;
|
||||
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
benchmarkES.addEventListener('done', function(e) {
|
||||
if (benchmarkES) {
|
||||
benchmarkES.close();
|
||||
benchmarkES = null;
|
||||
}
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
|
||||
loadBenchmarkAutotuneStatus();
|
||||
});
|
||||
}).catch(function(err) {
|
||||
status.textContent = 'Autotune error.';
|
||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||
});
|
||||
}
|
||||
benchmarkLoadGPUs();
|
||||
loadBenchmarkAutotuneStatus();
|
||||
function benchmarkRefreshResults() {
|
||||
fetch('/api/benchmark/results')
|
||||
.then(function(r) { return r.text(); })
|
||||
.then(function(html) {
|
||||
const el = document.getElementById('benchmark-results-section');
|
||||
if (el) el.innerHTML = html;
|
||||
})
|
||||
.catch(function() {});
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCard(exportDir string) string {
|
||||
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||
perf := renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score by saved benchmark run and GPU.",
|
||||
"No saved performance benchmark runs yet.",
|
||||
maxIdx,
|
||||
runs,
|
||||
)
|
||||
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||
return perf + "\n" + power
|
||||
}
|
||||
|
||||
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
|
||||
if len(runs) == 0 {
|
||||
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<div style="overflow-x:auto">`)
|
||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||
for i := 0; i <= maxGPUIndex; i++ {
|
||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||
}
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
overallColor := "var(--ok)"
|
||||
overallLabel := run.overallStatus
|
||||
if overallLabel == "" {
|
||||
overallLabel = "OK"
|
||||
}
|
||||
if overallLabel == "FAILED" {
|
||||
overallColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if overallLabel != "OK" {
|
||||
overallColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||
score, ok := run.gpuScores[idx]
|
||||
if !ok {
|
||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||
continue
|
||||
}
|
||||
gpuStatus := run.gpuStatuses[idx]
|
||||
scoreColor := ""
|
||||
switch gpuStatus {
|
||||
case "FAILED":
|
||||
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||
case "WARNING", "PARTIAL":
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
case "", "OK":
|
||||
default:
|
||||
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||
}
|
||||
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||
}
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||
baseDir := app.DefaultBeeBenchPerfDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return -1, nil
|
||||
}
|
||||
sort.Strings(paths)
|
||||
return loadBenchmarkHistoryFromPaths(paths)
|
||||
}
|
||||
|
||||
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||
maxGPUIndex := -1
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var result platform.NvidiaBenchmarkResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
continue
|
||||
}
|
||||
run := benchmarkHistoryRun{
|
||||
generatedAt: result.GeneratedAt,
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
gpuScores: make(map[int]float64),
|
||||
gpuStatuses: make(map[int]string),
|
||||
overallStatus: result.OverallStatus,
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||
if gpu.Index > maxGPUIndex {
|
||||
maxGPUIndex = gpu.Index
|
||||
}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
return maxGPUIndex, runs
|
||||
}
|
||||
|
||||
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
baseDir := app.DefaultBeeBenchPowerDir
|
||||
if strings.TrimSpace(exportDir) != "" {
|
||||
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||
}
|
||||
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||
if err != nil || len(paths) == 0 {
|
||||
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
type powerRun struct {
|
||||
generatedAt time.Time
|
||||
displayTime string
|
||||
result platform.NvidiaPowerBenchResult
|
||||
}
|
||||
var runs []powerRun
|
||||
for _, path := range paths {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var r platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &r); err != nil {
|
||||
continue
|
||||
}
|
||||
runs = append(runs, powerRun{
|
||||
generatedAt: r.GeneratedAt,
|
||||
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
result: r,
|
||||
})
|
||||
}
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
})
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||
|
||||
latest := runs[0].result
|
||||
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||
if latest.Hostname != "" {
|
||||
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||
}
|
||||
if latest.OverallStatus != "" {
|
||||
statusColor := "var(--ok)"
|
||||
if latest.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||
}
|
||||
b.WriteString(`</p>`)
|
||||
|
||||
if len(latest.GPUs) > 0 {
|
||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||
b.WriteString(`</tr></thead><tbody>`)
|
||||
for _, gpu := range latest.GPUs {
|
||||
finalLimitW := gpu.StablePowerLimitW
|
||||
if finalLimitW <= 0 {
|
||||
finalLimitW = gpu.AppliedPowerLimitW
|
||||
}
|
||||
derated := gpu.Derated ||
|
||||
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||
rowStyle := ""
|
||||
finalStyle := ""
|
||||
if derated {
|
||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||
}
|
||||
statusLabel := gpu.Status
|
||||
if statusLabel == "" {
|
||||
statusLabel = "OK"
|
||||
}
|
||||
statusColor := "var(--ok)"
|
||||
if statusLabel == "FAILED" {
|
||||
statusColor = "var(--crit-fg,#9f3a38)"
|
||||
} else if statusLabel != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
nominalStr := "-"
|
||||
if gpu.DefaultPowerLimitW > 0 {
|
||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||
}
|
||||
singleStr := "-"
|
||||
if gpu.AppliedPowerLimitW > 0 {
|
||||
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||
}
|
||||
multiStr := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||
}
|
||||
p95Str := "-"
|
||||
if gpu.MaxObservedPowerW > 0 {
|
||||
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||
}
|
||||
b.WriteString(`<tr` + rowStyle + `>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div>`)
|
||||
}
|
||||
|
||||
if len(runs) > 1 {
|
||||
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||
for i, run := range runs {
|
||||
statusColor := "var(--ok)"
|
||||
if run.result.OverallStatus != "OK" {
|
||||
statusColor = "var(--warn)"
|
||||
}
|
||||
b.WriteString(`<tr>`)
|
||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||
b.WriteString(`</tr>`)
|
||||
}
|
||||
b.WriteString(`</tbody></table></div></details>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
||||
package webui
|
||||
|
||||
// renderBurn returns the static HTML, CSS and JavaScript of the Burn page.
//
// The page offers a burn profile (smoke / acceptance / overnight), an NVIDIA
// GPU selection with sequential / parallel / ramp-up modes, and two groups of
// stress recipes. The embedded script enqueues tasks via
// POST /api/sat/<target>/run and streams their output from
// /api/tasks/<id>/stream into the on-page terminal.
//
// Values that come from API responses (GPU names, error messages) are
// HTML-escaped before they are inserted with innerHTML.
func renderBurn() string {
	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

<div class="card" style="margin-bottom:16px">
<div class="card-head">Burn Profile</div>
<div class="card-body burn-profile-body">
<div class="burn-profile-col">
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
</div>
<div class="burn-profile-col burn-profile-action">
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
</div>
</div>
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
</div>
</div>

<div class="card" style="margin-bottom:16px">
<div class="card-head">NVIDIA GPU Selection</div>
<div class="card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
</div>
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
<label class="cb-row">
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
<span>Sequential — selected GPUs one at a time</span>
</label>
<label class="cb-row" id="burn-parallel-label">
<input type="radio" name="burn-nvidia-mode" value="parallel">
<span>Parallel — all selected GPUs simultaneously</span>
</label>
<label class="cb-row" id="burn-ramp-label">
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
<span>Ramp-up — add one GPU at a time</span>
</label>
</div>
</div>
</div>

<div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
<div class="card burn-card">
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
</div>
</div>

<div class="card burn-card">
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
<div class="card-body burn-card-body">
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
</div>
</div>
</div>

<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
</div>

<style>
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.burn-profile-col { min-width:0; }
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
.burn-grid { align-items:stretch; }
.burn-card { height:100%; display:flex; flex-direction:column; }
.burn-card-body { flex:1; display:flex; flex-direction:column; }
.card-head-actions { justify-content:space-between; }
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
</style>

<script>
// Currently open task output stream, if any.
let biES = null;
// Escapes a value for safe insertion into HTML text or attribute values.
function burnEscapeHTML(value) {
  return String(value == null ? '' : value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
// Normalizes an enqueue response to a list of task IDs.
function burnTaskIDs(payload) {
  if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
  if (payload && payload.task_id) return [payload.task_id];
  return [];
}
function burnProfile() {
  const selected = document.querySelector('input[name="burn-profile"]:checked');
  return selected ? selected.value : 'smoke';
}
function burnSelectedGPUIndices() {
  return Array.from(document.querySelectorAll('.burn-gpu-checkbox'))
    .filter(function(el) { return el.checked && !el.disabled; })
    .map(function(el) { return parseInt(el.value, 10); })
    .filter(function(v) { return !Number.isNaN(v); })
    .sort(function(a, b) { return a - b; });
}
function burnNvidiaMode() {
  const el = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  return el ? el.value : 'sequential';
}
// Disables the multi-GPU modes when fewer than two GPUs are available and
// falls back to sequential if one of them was selected.
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    var label = el.closest('label');
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      if (label) label.style.opacity = '';
    }
  });
}
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (!selected.length) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
    return;
  }
  note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
}
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    // No GPU: the multi-GPU modes cannot apply either.
    burnApplyMultiGPUState(0);
    burnUpdateSelectionNote();
    return;
  }
  root.innerHTML = gpus.map(function(gpu) {
    // API-provided values are escaped before they go into innerHTML.
    const idx = burnEscapeHTML(gpu.index);
    const mem = gpu.memory_mb > 0 ? ' · ' + burnEscapeHTML(gpu.memory_mb) + ' MiB' : '';
    return '<label class="burn-gpu-row">'
      + '<input class="burn-gpu-checkbox" type="checkbox" value="' + idx + '" checked onchange="burnUpdateSelectionNote()">'
      + '<span><strong>GPU ' + idx + '</strong> — ' + burnEscapeHTML(gpu.name) + mem + '</span>'
      + '</label>';
  }).join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
function burnSelectAll() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
  burnUpdateSelectionNote();
}
function burnSelectNone() {
  document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
  burnUpdateSelectionNote();
}
function burnLoadGPUs() {
  fetch('/api/gpu/nvidia').then(function(r) {
    return r.json().then(function(body) {
      if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
      return body;
    });
  }).then(function(gpus) {
    burnRenderGPUList(gpus);
  }).catch(function(err) {
    document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + burnEscapeHTML(err.message) + '</p>';
    burnUpdateSelectionNote();
  });
}
// Enqueues one task. Resolves with the JSON response and rejects with an Error
// when the request fails or (for NVIDIA tasks) no GPU is selected.
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
  const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
  if (useSelectedNvidia) {
    const selected = burnSelectedGPUIndices();
    if (!selected.length) {
      return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
    }
    body.gpu_indices = selected;
    const bMode = burnNvidiaMode();
    if (bMode === 'ramp-up' && selected.length > 1) {
      body.stagger_gpu_start = true;
    } else if (bMode === 'parallel' && selected.length > 1) {
      body.parallel_gpus = true;
    }
  }
  return fetch('/api/sat/' + target + '/run', {
    method: 'POST',
    headers: {'Content-Type':'application/json'},
    body: JSON.stringify(body)
  }).then(function(r) {
    return r.json().then(function(payload) {
      if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
      return payload;
    });
  });
}
// Streams a single task into the terminal, replacing its previous content.
function streamTask(taskId, label) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
  const es = new EventSource('/api/tasks/' + taskId + '/stream');
  biES = es;
  es.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
  es.addEventListener('done', function(e) {
    es.close();
    if (biES === es) biES = null;
    term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
    term.scrollTop = term.scrollHeight;
  });
  es.onerror = function() {
    // While the browser is still reconnecting, keep waiting. Only report
    // when it has given up on the stream (readyState CLOSED).
    if (es.readyState !== EventSource.CLOSED) return;
    if (biES === es) biES = null;
    term.textContent += '\nERROR: stream disconnected.\n';
    term.scrollTop = term.scrollHeight;
  };
}
function streamBurnTask(taskId, label, resetTerminal) {
  return streamBurnTaskSet([taskId], label, resetTerminal);
}
// Streams the given tasks one after another. Resolves with {ok, error};
// never rejects.
function streamBurnTaskSet(taskIds, label, resetTerminal) {
  if (biES) { biES.close(); biES = null; }
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
  const term = document.getElementById('bi-terminal');
  if (resetTerminal) {
    term.textContent = '';
  }
  if (!Array.isArray(taskIds) || !taskIds.length) {
    term.textContent += 'ERROR: no tasks queued.\n';
    return Promise.resolve({ok:false, error:'no tasks queued'});
  }
  const streamNext = function(idx, failures) {
    if (idx >= taskIds.length) {
      return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
    }
    const taskId = taskIds[idx];
    term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
    return new Promise(function(resolve) {
      biES = new EventSource('/api/tasks/' + taskId + '/stream');
      biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
      biES.addEventListener('done', function(e) {
        biES.close();
        biES = null;
        term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + (e.data ? 1 : 0));
      });
      biES.onerror = function() {
        if (biES) {
          biES.close();
          biES = null;
        }
        term.textContent += '\nERROR: stream disconnected.\n';
        term.scrollTop = term.scrollHeight;
        resolve(failures + 1);
      };
    }).then(function(nextFailures) {
      return streamNext(idx + 1, nextFailures);
    });
  };
  return streamNext(0, 0);
}
// Runs the checked and enabled tasks one after another. The first enqueue
// error stops the run and is reported exactly once; the returned promise
// does not reject.
function runBurnTaskSet(tasks, statusElId) {
  const enabled = tasks.filter(function(t) {
    const el = document.getElementById(t.id);
    return el && el.checked && !el.disabled;
  });
  const status = statusElId ? document.getElementById(statusElId) : null;
  if (status) status.textContent = '';
  if (!enabled.length) {
    if (status) status.textContent = 'No tasks selected.';
    return;
  }
  const term = document.getElementById('bi-terminal');
  document.getElementById('bi-output').style.display = 'block';
  document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
  term.textContent = '';
  let failed = 0;
  const runNext = function(idx) {
    if (idx >= enabled.length) {
      if (status) status.textContent = 'Completed ' + enabled.length + ' task(s)' + (failed ? ' (' + failed + ' with errors).' : '.');
      return Promise.resolve();
    }
    const t = enabled[idx];
    term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
    if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
    return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
      .then(function(d) {
        return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
      })
      .then(function(result) {
        if (result && !result.ok) failed++;
        return runNext(idx + 1);
      });
  };
  // One handler for the whole chain: a failure is printed once instead of
  // once per preceding task, and no unhandled rejection is left behind.
  return runNext(0).catch(function(err) {
    if (status) status.textContent = 'Error: ' + err.message;
    document.getElementById('bi-output').style.display = 'block';
    term.textContent += 'ERROR: ' + err.message + '\n';
  });
}
function runPlatformStress() {
  const status = document.getElementById('burn-all-status');
  const comps = [];
  const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
  const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
  const hasChecked = function(ids) {
    return ids.some(function(id) {
      const el = document.getElementById(id);
      return el && el.checked && !el.disabled;
    });
  };
  if (hasChecked(computeIDs)) comps.push('cpu');
  if (hasChecked(gpuIDs)) comps.push('gpu');
  if (!comps.length) {
    if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
    return;
  }
  if (status) status.textContent = '';
  enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', {platform_components: comps}, false).then(function(d) {
    const ids = burnTaskIDs(d);
    if (!ids.length) throw new Error('no task queued');
    streamTask(ids[0], 'Platform Thermal Cycling');
  }).catch(function(err) {
    // Show enqueue failures instead of dropping them silently.
    if (status) status.textContent = 'Error: ' + err.message;
  });
}
function runAllBurnTasks() {
  const status = document.getElementById('burn-all-status');
  const all = [
    {id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
    {id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
    {id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
    {id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
    {id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
    {id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
    {id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
  ];
  if (status) status.textContent = 'Enqueuing...';
  runBurnTaskSet(all, 'burn-all-status');
}
// Enable only the GPU recipes whose tool is reported as available; the
// checkboxes start out disabled.
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
  const map = {
    'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
    'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
    'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
    'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
  };
  tools.forEach(function(t) {
    const spec = map[t.id];
    if (!spec) return;
    const cb = document.getElementById(spec.cb);
    const note = document.getElementById(spec.note);
    if (!cb) return;
    if (t.available) {
      cb.disabled = false;
    } else if (note) {
      note.textContent = '— ' + spec.reason;
    }
  });
}).catch(function() {});
burnLoadGPUs();
</script>`
}
|
||||
434
audit/internal/webui/page_export_tools.go
Normal file
434
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,434 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"html"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func renderExport(exportDir string) string {
|
||||
entries, _ := listExportFiles(exportDir)
|
||||
var rows strings.Builder
|
||||
for _, e := range entries {
|
||||
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||
url.QueryEscape(e), html.EscapeString(e)))
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||
}
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
` + renderUSBExportCard()
|
||||
}
|
||||
|
||||
func listExportFiles(exportDir string) ([]string, error) {
|
||||
var entries []string
|
||||
err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
rel, err := filepath.Rel(exportDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries = append(entries, rel)
|
||||
return nil
|
||||
})
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
sort.Strings(entries)
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// renderSupportBundleInline returns the "Download Support Bundle" button, its
// status line, and the script that drives them.
//
// The script fetches /export/support.tar.gz, takes the file name from the
// Content-Disposition header when present (default "bee-support.tar.gz"), and
// triggers a browser download of the response blob. The button is disabled
// while the request runs and restored afterwards with its original label
// (U+2193, the same down arrow as in the initial markup).
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
  var btn = document.getElementById('support-bundle-btn');
  var status = document.getElementById('support-bundle-status');
  btn.disabled = true;
  btn.textContent = 'Building...';
  status.textContent = 'Collecting logs and export data\u2026';
  status.style.color = 'var(--muted)';
  var filename = 'bee-support.tar.gz';
  fetch('/export/support.tar.gz')
    .then(function(r) {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      var cd = r.headers.get('Content-Disposition') || '';
      var m = cd.match(/filename="?([^";]+)"?/);
      if (m) filename = m[1];
      return r.blob();
    })
    .then(function(blob) {
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      status.textContent = 'Download started.';
      status.style.color = 'var(--ok-fg)';
    })
    .catch(function(e) {
      status.textContent = 'Error: ' + e.message;
      status.style.color = 'var(--crit-fg)';
    })
    .finally(function() {
      btn.disabled = false;
      btn.textContent = '\u2193 Download Support Bundle';
    });
};
</script>`
}
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
// renderUSBExportInline returns the HTML/JS fragment that lists removable USB
// targets and lets the user write the audit JSON or the support bundle to one.
//
// The script fetches /api/export/usb, renders one table row per target, and
// exposes window.usbRefresh and window.usbExport for the inline onclick
// handlers. usbExport POSTs the selected target object back to
// /api/export/usb/<type>.
//
// Every value taken from the target list is HTML-escaped before it is
// concatenated into innerHTML: fields such as the volume label come from the
// inserted device and must not be interpreted as markup.
func renderUSBExportInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
<div id="usb-targets" style="margin-top:12px"></div>
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
<script>
(function(){
  // Escape a value for safe interpolation into HTML text or attribute content.
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
      .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
  }
  function usbRefresh() {
    document.getElementById('usb-status').textContent = 'Scanning...';
    document.getElementById('usb-targets').innerHTML = '';
    document.getElementById('usb-msg').textContent = '';
    fetch('/api/export/usb').then(r=>r.json()).then(targets => {
      // Anything that is not an array (null, an error object) is treated as "no devices".
      const list = Array.isArray(targets) ? targets : [];
      window._usbTargets = list;
      const st = document.getElementById('usb-status');
      const ct = document.getElementById('usb-targets');
      if (list.length === 0) {
        st.textContent = 'No removable USB devices found.';
        return;
      }
      st.textContent = list.length + ' device(s) found:';
      ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
        list.map((t, idx) => {
          // Buttons reference the target by its index in window._usbTargets,
          // so no device-supplied text ends up inside the onclick attribute.
          return '<tr>' +
            '<td style="font-family:monospace">'+esc(t.device)+'</td>' +
            '<td>'+esc(t.fs_type)+'</td>' +
            '<td>'+esc(t.size)+'</td>' +
            '<td>'+esc(t.label)+'</td>' +
            '<td style="font-size:12px;color:var(--muted)">'+esc(t.model)+'</td>' +
            '<td style="white-space:nowrap">' +
            '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
            '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
            '<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
            '</td></tr>';
        }).join('') + '</table>';
    }).catch(e => {
      document.getElementById('usb-status').textContent = 'Error: ' + e;
    });
  }
  window.usbExport = function(type, targetIndex, btn) {
    const msg = document.getElementById('usb-msg');
    const target = (window._usbTargets || [])[targetIndex];
    if (!target) {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: USB target not found. Refresh and try again.';
      return;
    }
    const row = btn ? btn.closest('td') : null;
    const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
    const originalText = btn ? btn.textContent : '';
    if (btn) {
      btn.disabled = true;
      btn.textContent = 'Exporting...';
    }
    if (rowMsg) {
      rowMsg.style.color = 'var(--muted)';
      rowMsg.textContent = 'Working...';
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
    fetch('/api/export/usb/'+type, {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify(target)
    }).then(async r => {
      const d = await r.json();
      if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
      return d;
    }).then(d => {
      msg.style.color = 'var(--ok,green)';
      msg.textContent = d.message || 'Done.';
      if (rowMsg) {
        rowMsg.style.color = 'var(--ok,green)';
        rowMsg.textContent = d.message || 'Done.';
      }
    }).catch(e => {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: '+e;
      if (rowMsg) {
        rowMsg.style.color = 'var(--err,red)';
        rowMsg.textContent = 'Error: ' + e;
      }
    }).finally(() => {
      if (btn) {
        btn.disabled = false;
        btn.textContent = originalText;
      }
    });
  };
  window.usbRefresh = usbRefresh;
  usbRefresh();
})();
</script>`
}
|
||||
|
||||
// renderNvidiaSelfHealInline returns the HTML/JS fragment for the NVIDIA
// self-heal panel: a status table loaded from /api/gpu/nvidia-status, a
// "Restart GPU Drivers" button (POST /api/services/action for bee-nvidia) and
// a per-GPU "Reset GPU" button (POST /api/gpu/nvidia-reset).
//
// The script defines the global functions nvidiaSelfHealShowResult,
// nvidiaRestartDrivers, nvidiaResetGPU and loadNvidiaSelfHeal, and loads the
// table once when the fragment is inserted.
//
// All values from the status response are HTML-escaped before being placed
// into innerHTML. While a request is in flight the output panel reports a
// neutral "running" state rather than success.
func renderNvidiaSelfHealInline() string {
	return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
</div>
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
</div>
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
</div>
<script>
// Escape a value for safe interpolation into HTML text or attribute content.
function nvidiaSelfHealEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
// Show the output panel. status is 'ok', 'running', or anything else (treated as failure).
function nvidiaSelfHealShowResult(label, status, output) {
  var out = document.getElementById('nvidia-self-heal-out');
  var term = document.getElementById('nvidia-self-heal-terminal');
  var statusEl = document.getElementById('nvidia-self-heal-out-status');
  var labelEl = document.getElementById('nvidia-self-heal-out-label');
  out.style.display = 'block';
  labelEl.textContent = label;
  term.textContent = output || '(no output)';
  term.scrollTop = term.scrollHeight;
  if (status === 'ok') {
    statusEl.textContent = '✓ done';
    statusEl.style.color = 'var(--ok-fg, #2c662d)';
  } else if (status === 'running') {
    statusEl.textContent = 'running\u2026';
    statusEl.style.color = 'var(--muted)';
  } else {
    statusEl.textContent = '✗ failed';
    statusEl.style.color = 'var(--crit-fg, #9f3a38)';
  }
}
function nvidiaRestartDrivers() {
  var btn = document.getElementById('nvidia-restart-btn');
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Restarting...';
  nvidiaSelfHealShowResult('restart bee-nvidia', 'running', 'Running...');
  fetch('/api/services/action', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({name:'bee-nvidia', action:'restart'})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(function() {
      // loadServices is not defined in this fragment; only call it when the
      // hosting page provides it.
      if (typeof loadServices === 'function') loadServices();
      loadNvidiaSelfHeal();
    }, 800);
  }).catch(e => {
    nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function nvidiaResetGPU(index, btn) {
  var original = btn.textContent;
  btn.disabled = true;
  btn.textContent = 'Resetting...';
  nvidiaSelfHealShowResult('reset gpu ' + index, 'running', 'Running...');
  fetch('/api/gpu/nvidia-reset', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({index:index})
  }).then(r=>r.json()).then(d => {
    nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || d.error || '(no output)');
    setTimeout(loadNvidiaSelfHeal, 1000);
  }).catch(e => {
    nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
  }).finally(() => {
    btn.disabled = false;
    btn.textContent = original;
  });
}
function loadNvidiaSelfHeal() {
  var status = document.getElementById('nvidia-self-heal-status');
  var table = document.getElementById('nvidia-self-heal-table');
  var esc = nvidiaSelfHealEsc;
  status.textContent = 'Loading NVIDIA GPU status...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
    if (!Array.isArray(gpus) || gpus.length === 0) {
      status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
      table.innerHTML = '';
      return;
    }
    status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
    const rows = gpus.map(g => {
      const serial = g.serial || '';
      const bdf = g.bdf || '';
      // Identify the GPU by serial, then PCI address, then its index.
      const id = serial || bdf || ('gpu-' + g.index);
      const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
      const details = [];
      if (serial) details.push('serial ' + serial);
      if (bdf) details.push('bdf ' + bdf);
      if (g.parse_failure && g.raw_line) details.push(g.raw_line);
      return '<tr>'
        + '<td style="white-space:nowrap">' + esc(g.index) + '</td>'
        + '<td>' + esc(g.name || 'unknown') + '</td>'
        + '<td style="font-family:monospace">' + esc(id) + '</td>'
        + '<td><span class="badge ' + badge + '">' + esc(g.status || 'UNKNOWN') + '</span>'
        + (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.map(function(x){ return esc(x); }).join(' | ') + '</div>' : '')
        + '</td>'
        // Number() keeps the onclick argument a numeric literal whatever the response holds.
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + Number(g.index) + ', this)">Reset GPU</button></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(e => {
    status.textContent = 'Error loading NVIDIA GPU status: ' + e;
    status.style.color = 'var(--crit-fg, #9f3a38)';
    table.innerHTML = '';
  });
}
loadNvidiaSelfHeal();
</script>`
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||
for _, entry := range entries {
|
||||
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||
}
|
||||
if len(entries) == 0 {
|
||||
body.WriteString(`<li>No export files found.</li>`)
|
||||
}
|
||||
body.WriteString(`</ul></body></html>`)
|
||||
return body.String(), nil
|
||||
}
|
||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
||||
package webui
|
||||
|
||||
// renderInstallInline returns the HTML/CSS/JS fragment for the "install to
// disk" workflow: a disk table loaded from /api/install/disks, a confirmation
// step where the user must type the selected device name, and a progress view
// that starts the job through POST /api/install/run and streams its log from
// /api/tasks/<id>/stream via EventSource.
//
// The script defines the global functions installRefreshDisks,
// installSelectDisk, installDeselect, installCheckConfirm, installStart and
// installStreamLog, and loads the disk list once when inserted.
//
// Device, model, size and warning strings from the disk list are HTML-escaped
// before being written through innerHTML or into attributes.
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

// Escape a value for safe interpolation into HTML text or attribute content.
function installEsc(v) {
  return String(v == null ? '' : v)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}

function installRefreshDisks() {
  document.getElementById('install-loading').style.display = '';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-confirm-section').style.display = 'none';
  _installSelected = null;
  fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
    document.getElementById('install-loading').style.display = 'none';
    var tbody = document.getElementById('install-disk-tbody');
    tbody.innerHTML = '';
    if (!disks || disks.length === 0) {
      tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
    } else {
      disks.forEach(function(d) {
        var warnings = (d.warnings || []);
        var statusHtml;
        if (warnings.length === 0) {
          statusHtml = '<span class="badge badge-ok">OK</span>';
        } else {
          // One "too small" warning marks every badge for this disk as an error.
          var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
          var cls = hasSmall ? 'badge-err' : 'badge-warn';
          statusHtml = warnings.map(function(w){
            // Full text goes in the tooltip; the badge shows at most 39 characters.
            return '<span class="badge ' + cls + '" title="' + installEsc(w) + '">' +
              installEsc(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
          }).join(' ');
        }
        var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
          ? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
        var tr = document.createElement('tr');
        // dataset values are stored as plain text and read back in installSelectDisk.
        tr.dataset.device = d.device;
        tr.dataset.model = d.model || 'Unknown';
        tr.dataset.size = d.size;
        tr.dataset.warnings = JSON.stringify(warnings);
        tr.innerHTML =
          '<td><input type="radio" name="install-disk" value="' + installEsc(d.device) + '"></td>' +
          '<td><code>' + installEsc(d.device) + '</code>' + mountedNote + '</td>' +
          '<td>' + installEsc(d.model || '—') + '</td>' +
          '<td>' + installEsc(d.size) + '</td>' +
          '<td>' + statusHtml + '</td>';
        tr.addEventListener('click', function(){ installSelectDisk(this); });
        tbody.appendChild(tr);
      });
    }
    document.getElementById('install-disk-section').style.display = '';
  }).catch(function(e){
    document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
  });
}

function installSelectDisk(tr) {
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  tr.classList.add('selected');
  var radio = tr.querySelector('input[type=radio]');
  if (radio) radio.checked = true;
  _installSelected = {
    device: tr.dataset.device,
    model: tr.dataset.model,
    size: tr.dataset.size,
    warnings: JSON.parse(tr.dataset.warnings || '[]')
  };
  var warnBox = document.getElementById('install-confirm-warn');
  var warnLines = '<strong>⚠ DANGER:</strong> ' + installEsc(_installSelected.device) +
    ' (' + installEsc(_installSelected.model) + ', ' + installEsc(_installSelected.size) + ')' +
    ' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
  if (_installSelected.warnings.length > 0) {
    warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + installEsc(w); }).join('<br>');
  }
  warnBox.innerHTML = warnLines;
  // Every new selection must be re-confirmed by typing the device name.
  document.getElementById('install-confirm-input').value = '';
  document.getElementById('install-start-btn').disabled = true;
  document.getElementById('install-confirm-section').style.display = '';
  document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
  _installSelected = null;
  document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
  document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
  document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
  var val = document.getElementById('install-confirm-input').value.trim();
  // The start button is enabled only on an exact match with the selected device.
  var ok = _installSelected && val === _installSelected.device;
  document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
  if (!_installSelected) return;
  document.getElementById('install-confirm-section').style.display = 'none';
  document.getElementById('install-disk-section').style.display = 'none';
  document.getElementById('install-loading').style.display = 'none';
  var prog = document.getElementById('install-progress-section');
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  prog.style.display = '';
  term.textContent = '';
  status.textContent = 'Starting installation…';
  status.style.color = 'var(--muted)';

  fetch('/api/install/run', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({device: _installSelected.device})
  }).then(function(r){
    return r.json().then(function(j){
      if (!r.ok) throw new Error(j.error || r.statusText);
      return j;
    });
  }).then(function(j){
    if (!j.task_id) throw new Error('missing task id');
    installStreamLog(j.task_id);
  }).catch(function(e){
    status.textContent = 'Error: ' + e;
    status.style.color = 'var(--crit-fg)';
  });
}

function installStreamLog(taskId) {
  var term = document.getElementById('install-terminal');
  var status = document.getElementById('install-status');
  var es = new EventSource('/api/tasks/' + encodeURIComponent(taskId) + '/stream');
  es.onmessage = function(e) {
    term.textContent += e.data + '\n';
    term.scrollTop = term.scrollHeight;
  };
  // A 'done' event with empty data is treated as success; non-empty data is the failure text.
  es.addEventListener('done', function(e) {
    es.close();
    if (!e.data) {
      status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
      var rebootBtn = document.createElement('button');
      rebootBtn.className = 'btn btn-primary btn-sm';
      rebootBtn.style.marginLeft = '12px';
      rebootBtn.textContent = 'Reboot now';
      rebootBtn.onclick = function(){
        fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
          body: JSON.stringify({name:'', action:'reboot'})});
      };
      status.appendChild(rebootBtn);
    } else {
      status.textContent = '✗ Installation failed: ' + e.data;
      status.style.color = 'var(--crit-fg)';
    }
  });
  es.onerror = function() {
    es.close();
    status.textContent = '✗ Stream disconnected.';
    status.style.color = 'var(--crit-fg)';
  };
}

installRefreshDisks();
</script>
`
}
|
||||
|
||||
func renderInstall() string {
|
||||
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||
renderInstallInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
// renderTasks returns the body of the Tasks page: a toolbar with "Cancel All"
// and "Kill Workers", and a paginated task table that is reloaded from
// /api/tasks every 2 seconds.
//
// The script keeps the last fetched list in _tasksAll and renders
// _taskPageSize rows per page. Running and pending tasks get a Cancel button;
// pending tasks also get priority up/down buttons.
//
// All task fields are HTML-escaped before being written through innerHTML.
// Task ids are passed to onclick handlers as JSON string literals and are
// URL-encoded when used in request paths.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

function loadTasks() {
  fetch('/api/tasks').then(r=>r.json()).then(tasks => {
    _tasksAll = Array.isArray(tasks) ? tasks : [];
    if (_tasksAll.length === 0) {
      _taskPage = 1;
      document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
      return;
    }
    // Clamp the current page in case the list shrank since the last refresh.
    const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
    if (_taskPage > totalPages) _taskPage = totalPages;
    if (_taskPage < 1) _taskPage = 1;
    const start = (_taskPage - 1) * _taskPageSize;
    const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
    const rows = pageTasks.map(t => {
      const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
      const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
      // Known statuses map to fixed labels; anything else is shown escaped.
      const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||escHtml(t.status);
      const href = '/tasks/'+encodeURIComponent(t.id);
      // JSON.stringify yields a quoted JS string literal; escHtml then makes
      // it safe inside the double-quoted onclick attribute.
      const idArg = escHtml(JSON.stringify(String(t.id)));
      let actions = '<a class="btn btn-sm btn-secondary" href="'+href+'">Open</a>';
      if (t.status === 'running' || t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask('+idArg+')">Cancel</button>';
      }
      if (t.status === 'pending') {
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority('+idArg+',1)" title="Increase priority">⇧</button>';
        actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority('+idArg+',-1)" title="Decrease priority">⇩</button>';
      }
      return '<tr><td><a href="'+href+'">'+escHtml(t.name)+'</a></td>' +
        '<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
        '<td style="font-size:12px;color:var(--muted)">'+escHtml(fmtTime(t.created_at))+'</td>' +
        '<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
        '<td>'+escHtml(t.priority)+'</td>' +
        '<td>'+actions+'</td></tr>';
    }).join('');
    const showingFrom = start + 1;
    const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
    const pager =
      '<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
      '<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
      '<div style="display:flex;align-items:center;gap:8px">' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
      '<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
      '<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
      '</div>' +
      '</div>';
    document.getElementById('tasks-table').innerHTML =
      '<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
  });
}

// Escape a value for safe interpolation into HTML text or attribute content.
// Accepts non-string input (numbers, null, undefined).
function escHtml(s) {
  return String(s == null ? '' : s)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;').replace(/'/g,'&#39;');
}
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDurSec(sec) {
  sec = Math.max(0, Math.round(sec||0));
  if (sec < 60) return sec+'s';
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
}
function setTaskPage(page) {
  const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
  _taskPage = Math.min(totalPages, Math.max(1, page));
  loadTasks();
}

function cancelTask(id) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
  fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
  if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
  fetch('/api/tasks/kill-workers',{method:'POST'})
    .then(r=>r.json())
    .then(d=>{
      loadTasks();
      var toast = document.getElementById('kill-toast');
      var parts = [];
      if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
      if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
      toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
      toast.style.display = '';
      // The toast hides itself after five seconds.
      setTimeout(()=>{ toast.style.display='none'; }, 5000);
    });
}
function setPriority(id, delta) {
  fetch('/api/tasks/'+encodeURIComponent(id)+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
    .then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
func renderMetrics() string {
|
||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — CPU</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Server — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||
<div class="card-head">Server — Fan RPM</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||
<div>
|
||||
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||
</div>
|
||||
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||
<input id="gpu-chart-toggle" type="checkbox">
|
||||
<span>One chart per GPU</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-metric">
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Compute Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Memory Load</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Core Clock</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Power</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">GPU — Temperature</div>
|
||||
<div class="card-body" style="padding:8px">
|
||||
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||
</section>
|
||||
|
||||
<script>
|
||||
let gpuChartKey = '';
|
||||
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||
let metricsNvidiaGPUsPromise = null;
|
||||
|
||||
// Returns a promise for the NVIDIA GPU list from /api/gpu/nvidia.
// A successful result is cached in metricsNvidiaGPUsPromise and reused.
// On failure the promise still resolves to [], but the cache is cleared so
// that the next call retries instead of serving the empty list forever.
function loadMetricsNvidiaGPUs() {
  if (!metricsNvidiaGPUsPromise) {
    const pending = fetch('/api/gpu/nvidia')
      .then(function(r) {
        if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
        return r.json();
      })
      .then(function(list) { return Array.isArray(list) ? list : []; })
      .catch(function() {
        // Only drop the cache entry if it still refers to this request.
        if (metricsNvidiaGPUsPromise === pending) metricsNvidiaGPUsPromise = null;
        return [];
      });
    metricsNvidiaGPUsPromise = pending;
  }
  return metricsNvidiaGPUsPromise;
}
|
||||
|
||||
// Builds an object mapping numeric GPU index to GPU name.
// Entries without a finite index or without a name are left out.
function metricsGPUNameMap(list) {
  return (list || []).reduce(function(byIndex, gpu) {
    const index = Number(gpu.index);
    if (Number.isFinite(index) && gpu.name) byIndex[index] = gpu.name;
    return byIndex;
  }, {});
}
|
||||
|
||||
// Returns "GPU <idx> — <name>" when a name is known for idx, otherwise "GPU <idx>".
function metricsGPUDisplayLabel(idx, names) {
  const base = 'GPU ' + idx;
  const gpuName = names && names[idx];
  if (!gpuName) return base;
  return base + ' — ' + gpuName;
}
|
||||
|
||||
// Reads the saved chart mode from sessionStorage.
// Returns true only when the stored value is 'per-gpu'; storage errors yield false.
function loadGPUChartModePreference() {
  let stored = null;
  try {
    stored = sessionStorage.getItem(gpuChartModeStorageKey);
  } catch (_) {
    return false;
  }
  return stored === 'per-gpu';
}
|
||||
|
||||
// Stores the chart mode ('per-gpu' or 'per-metric') in sessionStorage.
// Storage errors are ignored on purpose: the preference is best-effort.
function saveGPUChartModePreference(perGPU) {
  const mode = perGPU ? 'per-gpu' : 'per-metric';
  try {
    sessionStorage.setItem(gpuChartModeStorageKey, mode);
  } catch (_) {
    /* ignore */
  }
}
|
||||
|
||||
// Reloads one chart <img> with a cache-busting query string.
// The new URL is first loaded into a detached Image; the visible element is
// switched only after that load succeeds. Elements that are already loading
// or that have no offsetParent are skipped.
function refreshChartImage(el) {
  if (!el) return;
  if (el.dataset.loading === '1') return;
  if (el.offsetParent === null) return;

  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  const probe = new Image();

  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';

  // Marks the element as idle again, swapping the source first on success.
  const settle = function(loaded) {
    if (loaded) el.src = nextSrc;
    el.dataset.loading = '0';
  };
  probe.onload = function() { settle(true); };
  probe.onerror = function() { settle(false); };
  probe.src = nextSrc;
}
|
||||
|
||||
// Refreshes every <img> marked with data-chart-refresh="1".
function refreshCharts() {
  const charts = document.querySelectorAll('img[data-chart-refresh="1"]');
  for (let i = 0; i < charts.length; i++) {
    refreshChartImage(charts[i]);
  }
}
|
||||
|
||||
// Returns the distinct finite numeric `index` values found in rows, ascending.
function gpuIndices(rows) {
  const unique = new Set();
  (rows || []).forEach(function(row) {
    const n = Number(row.index);
    if (Number.isFinite(n)) unique.add(n);
  });
  return Array.from(unique).sort(function(a, b) { return a - b; });
}
|
||||
|
||||
// Rebuilds the per-GPU overview cards inside #gpu-metrics-by-gpu, one card per
// entry of indices. Does nothing when the host element is missing.
// The label contains the GPU name reported by the API, so it is HTML-escaped
// before being concatenated into markup and into the alt attribute.
function renderGPUOverviewCards(indices, names) {
  const host = document.getElementById('gpu-metrics-by-gpu');
  if (!host) return;

  const escapeHTML = function(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  };

  host.innerHTML = indices.map(function(idx) {
    const label = escapeHTML(metricsGPUDisplayLabel(idx, names));
    const safeIdx = escapeHTML(idx);
    return '<div class="card" style="margin-bottom:16px">' +
      '<div class="card-head">' + label + ' — Overview</div>' +
      '<div class="card-body" style="padding:8px">' +
      '<img id="chart-gpu-' + safeIdx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + safeIdx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
      '</div></div>';
  }).join('');
}
|
||||
|
||||
// Shows either the per-metric or the per-GPU chart container, depending on
// whether the #gpu-chart-toggle checkbox is checked. Missing elements are skipped.
function applyGPUChartMode() {
  const metricPane = document.getElementById('gpu-metrics-by-metric');
  const gpuPane = document.getElementById('gpu-metrics-by-gpu');
  const checkbox = document.getElementById('gpu-chart-toggle');
  const showPerGPU = Boolean(checkbox && checkbox.checked);
  if (metricPane) {
    metricPane.style.display = showPerGPU ? 'none' : '';
  }
  if (gpuPane) {
    gpuPane.style.display = showPerGPU ? '' : 'none';
  }
}
|
||||
|
||||
// Adapts the page layout to the latest metrics sample `d`:
//  - the fan card is visible only when d.fans is non-empty;
//  - the GPU section is visible only when d.gpus yields at least one index;
//  - the summary line lists the detected GPUs;
//  - the per-GPU cards are rebuilt when the set of indices or names changes.
function syncMetricsLayout(d) {
  const fanCard = document.getElementById('card-server-fans');
  if (fanCard) {
    const hasFans = d.fans && d.fans.length > 0;
    fanCard.style.display = hasFans ? '' : 'none';
  }

  const section = document.getElementById('gpu-metrics-section');
  const summary = document.getElementById('gpu-metrics-summary');
  const indices = gpuIndices(d.gpus);
  const hasGPUs = indices.length > 0;

  loadMetricsNvidiaGPUs().then(function(gpus) {
    const names = metricsGPUNameMap(gpus);

    if (section) section.style.display = hasGPUs ? '' : 'none';

    if (summary) {
      if (hasGPUs) {
        const labels = indices.map(function(idx) { return metricsGPUDisplayLabel(idx, names); });
        summary.textContent = 'Detected GPUs: ' + labels.join(', ');
      } else {
        summary.textContent = 'No GPUs detected in live metrics.';
      }
    }

    // The key changes whenever the index list or any GPU name changes.
    const nameList = indices.map(function(idx) { return names[idx] || ''; });
    const nextKey = indices.join(',') + '|' + nameList.join(',');
    if (nextKey !== gpuChartKey) {
      renderGPUOverviewCards(indices, names);
      gpuChartKey = nextKey;
    }

    applyGPUChartMode();
  });
}
|
||||
|
||||
// Fetches the latest metrics sample and hands it to syncMetricsLayout.
// Failures are ignored on purpose; the function is called again periodically.
function loadMetricsLayout() {
  fetch('/api/metrics/latest')
    .then(function(response) { return response.json(); })
    .then(syncMetricsLayout)
    .catch(function() {});
}
|
||||
|
||||
// Page start-up: restore the saved chart mode, wire the toggle, then start polling.
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}
applyGPUChartMode();

loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
|
||||
</script>`
|
||||
}
|
||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||
func renderNetworkInline() string {
|
||||
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||
</div>
|
||||
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div class="grid2" style="margin-top:16px">
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div>
|
||||
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
|
||||
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
var _netCountdownTimer = null;
|
||||
var _netRefreshTimer = null;
|
||||
const NET_ROLLBACK_SECS = 60;
|
||||
// Fetches /api/network and re-renders the interface table, the default-route
// line and the pending-change banner. Fetch failures are ignored; the function
// is polled periodically.
// All API-provided text is HTML-escaped before it is concatenated into markup.
// Interface name and state reach the click handlers through data-* attributes
// rather than being spliced into inline JavaScript string literals.
function loadNetwork() {
  function esc(value) {
    return String(value == null ? '' : value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  fetch('/api/network').then(r=>r.json()).then(d => {
    const rows = (d.interfaces||[]).map(i => {
      const name = esc(i.Name);
      const state = esc(i.State);
      return '<tr><td style="cursor:pointer" data-iface="'+name+'" onclick="selectIface(this.dataset.iface)" title="Use this interface in the forms below"><span style="text-decoration:underline">'+name+'</span></td>' +
        '<td style="cursor:pointer" data-iface="'+name+'" data-state="'+state+'" onclick="toggleIface(this.dataset.iface,this.dataset.state)" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+state+'</span></td>' +
        '<td>'+(i.IPv4||[]).map(function(addr) { return esc(addr); }).join(', ')+'</td></tr>';
    }).join('');
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
      (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+esc(d.default_route)+'</p>' : '');
    if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    else hideNetPending();
  }).catch(function() {});
}
|
||||
// Copies the chosen interface name into both the DHCP and the static-IP form.
function selectIface(iface) {
  ['dhcp-iface', 'st-iface'].forEach(function(fieldId) {
    document.getElementById(fieldId).value = iface;
  });
}
|
||||
// Requests an up/down toggle for one interface. The rollback banner is shown
// immediately, then re-synced with the server's rollback_in value on success.
// currentState is accepted but not sent; only the interface name is posted.
// If the request itself fails, the interface list is reloaded after 1.5 s.
function toggleIface(iface, currentState) {
  showNetPending(NET_ROLLBACK_SECS);
  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ iface: iface })
  };
  fetch('/api/network/toggle', request)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      if (d.error) {
        hideNetPending();
        alert('Error: ' + d.error);
        return;
      }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
// Stops the rollback countdown, if one is running, and hides the pending banner.
function hideNetPending() {
  const banner = document.getElementById('net-pending');
  if (_netCountdownTimer) {
    clearInterval(_netCountdownTimer);
  }
  _netCountdownTimer = null;
  banner.style.display = 'none';
}
|
||||
// Shows the pending-change banner and (re)starts a one-second countdown from secs.
// A missing value or a value below 1 hides the banner instead. When the
// countdown reaches zero the banner is hidden and the network state reloaded.
function showNetPending(secs) {
  if (!secs || secs < 1) {
    hideNetPending();
    return;
  }
  const banner = document.getElementById('net-pending');
  banner.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);

  let left = secs;
  const paint = function() {
    document.getElementById('net-countdown').textContent = left;
  };
  paint();
  _netCountdownTimer = setInterval(function() {
    left--;
    paint();
    if (left > 0) return;
    hideNetPending();
    loadNetwork();
  }, 1000);
}
|
||||
// Hides the banner, then POSTs the confirm request and reloads the network
// state. Request failures are ignored.
function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm', { method: 'POST' })
    .then(function() { return loadNetwork(); })
    .catch(function() {});
}
|
||||
// Hides the banner, then POSTs the rollback request and reloads the network
// state. Request failures are ignored.
function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback', { method: 'POST' })
    .then(function() { return loadNetwork(); })
    .catch(function() {});
}
|
||||
// Runs DHCP on the interface typed into #dhcp-iface ('all' when left empty).
// The command output or error is written to #dhcp-out. On success the rollback
// banner is re-synced and the interface list reloaded; if the request itself
// fails, the list is reloaded after 1.5 s.
function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ interface: iface || 'all' })
  };
  fetch('/api/network/dhcp', request)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
      if (d.error) {
        hideNetPending();
        return;
      }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
// Applies the static IPv4 settings entered in the form. The DNS field is split
// on commas, trimmed, and empty entries are dropped. The command output or
// error is written to #static-out. On success the rollback banner is re-synced
// and the interface list reloaded; if the request fails, the list is reloaded
// after 1.5 s.
function setStatic() {
  const dns = document.getElementById('st-dns').value
    .split(',')
    .map(function(entry) { return entry.trim(); })
    .filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);

  const fieldValue = function(id) { return document.getElementById(id).value; };
  const payload = {
    interface: fieldValue('st-iface'),
    address: fieldValue('st-addr'),
    prefix: fieldValue('st-prefix'),
    gateway: fieldValue('st-gw'),
    dns: dns
  };

  fetch('/api/network/static', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  })
    .then(function(r) { return r.json(); })
    .then(function(d) {
      document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
      if (d.error) {
        hideNetPending();
        return;
      }
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
      loadNetwork();
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||
// Load the interface list once, then poll it every 5 seconds. Any interval
// handle already held in _netRefreshTimer is cleared before a new one is stored.
loadNetwork();
|
||||
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNetwork() string {
|
||||
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||
renderNetworkInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:12px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="svc-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
<script>
|
||||
// Fetches /api/services and renders the unit table: name, a status badge that
// expands the unit's status text, and Start/Stop/Restart buttons.
// Every API-provided string is HTML-escaped before it is concatenated into
// markup, and the unit name reaches svcAction through a data attribute rather
// than being spliced into an inline JavaScript string literal.
// A failed request replaces the "Loading..." placeholder with an error line.
function loadServices() {
  function esc(value) {
    return String(value == null ? '' : value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  fetch('/api/services').then(r=>r.json()).then(svcs => {
    const rows = (Array.isArray(svcs) ? svcs : []).map(s => {
      const rawName = String(s.name || '');
      const name = esc(rawName);
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      // The id is reduced to [a-z0-9-], so it is safe inside the inline handler.
      const id = 'svc-body-'+rawName.replace(/[^a-z0-9]/g,'-');
      const body = esc(s.body||'');
      return '<tr>' +
        '<td style="white-space:nowrap">'+name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+esc(st)+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-start" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-stop" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+name+'-restart" data-svc="'+name+'" onclick="svcAction(this,this.dataset.svc,\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  }).catch(function() {
    const host = document.getElementById('svc-table');
    if (host) host.innerHTML = '<p style="color:var(--crit-fg, #9f3a38);font-size:13px">Failed to load services.</p>';
  });
}
|
||||
// Toggles the element with the given id between hidden and block display.
// Unknown ids are ignored.
function toggleBody(id) {
  const panel = document.getElementById(id);
  if (!panel) return;
  const hidden = panel.style.display === 'none';
  panel.style.display = hidden ? 'block' : 'none';
}
|
||||
// Runs a service action (start/stop/restart) through /api/services/action.
// The clicked button is disabled while the request is in flight, the output
// panel shows the command output and a done/failed/error status, and the
// service table is reloaded 800 ms after a completed request.
function svcAction(btn, name, action) {
  var originalLabel = btn.textContent;
  btn.disabled = true;
  btn.textContent = '...';

  var panel = document.getElementById('svc-out');
  var term = document.getElementById('svc-terminal');
  var statusEl = document.getElementById('svc-out-status');
  var labelEl = document.getElementById('svc-out-label');

  panel.style.display = 'block';
  labelEl.textContent = action + ' ' + name;
  term.textContent = 'Running...';
  statusEl.textContent = '';
  statusEl.style.color = '';

  // Writes the status text together with its colour.
  function setStatus(text, color) {
    statusEl.textContent = text;
    statusEl.style.color = color;
  }
  // Puts the button back into its clickable state.
  function restoreButton() {
    btn.textContent = originalLabel;
    btn.disabled = false;
  }

  fetch('/api/services/action', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: name, action: action })
  })
    .then(function(r) { return r.json(); })
    .then(function(d) {
      term.textContent = d.output || d.error || '(no output)';
      term.scrollTop = term.scrollHeight;
      if (d.status === 'ok') {
        setStatus('✓ done', 'var(--ok-fg, #2c662d)');
      } else {
        setStatus('✗ failed', 'var(--crit-fg, #9f3a38)');
      }
      restoreButton();
      setTimeout(loadServices, 800);
    })
    .catch(function(e) {
      term.textContent = 'Request failed: ' + e;
      setStatus('✗ error', 'var(--crit-fg, #9f3a38)');
      restoreButton();
    });
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderServices() string {
|
||||
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||
renderServicesInline() +
|
||||
`</div></div>`
|
||||
}
|
||||
716
audit/internal/webui/page_validate.go
Normal file
716
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,716 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// validateInventory holds the per-subsystem inventory text and the GPU counts
// used when rendering the Validate page.
type validateInventory struct {
	// Text passed to the matching Validate card body or inserted into the page.
	// NOTE(review): presumably pre-formatted summaries; confirm whether they are
	// already HTML-escaped where they are produced.
	CPU, Memory, Storage, NVIDIA, AMD string

	// GPU counts; NvidiaGPUCount scales the NVIDIA duration estimates.
	NvidiaGPUCount, AMDGPUCount int
}
|
||||
|
||||
// validateFmtDur renders an approximate duration for display.
// Values below 120 seconds are shown as "~N s"; larger values are shown as
// "~N min", with the minutes computed as (secs+29)/60 using integer division.
func validateFmtDur(secs int) string {
	switch {
	case secs >= 120:
		return fmt.Sprintf("~%d min", (secs+29)/60)
	default:
		return fmt.Sprintf("~%d s", secs)
	}
}
|
||||
|
||||
func validateTotalValidateSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUValidateSec +
|
||||
platform.SATEstimatedMemoryValidateSec +
|
||||
n*platform.SATEstimatedNvidiaGPUValidatePerGPUSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
return total
|
||||
}
|
||||
|
||||
func validateTotalStressSec(n int) int {
|
||||
if n < 0 {
|
||||
n = 0
|
||||
}
|
||||
total := platform.SATEstimatedCPUStressSec +
|
||||
platform.SATEstimatedMemoryStressSec +
|
||||
n*platform.SATEstimatedNvidiaGPUStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedStressPerGPUSec +
|
||||
n*platform.SATEstimatedNvidiaTargetedPowerPerGPUSec +
|
||||
platform.SATEstimatedNvidiaPulseTestSec +
|
||||
platform.SATEstimatedNvidiaInterconnectSec +
|
||||
platform.SATEstimatedNvidiaBandwidthSec
|
||||
return total
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
func() string {
|
||||
perV := platform.SATEstimatedNvidiaGPUValidatePerGPUSec
|
||||
perS := platform.SATEstimatedNvidiaGPUStressPerGPUSec
|
||||
if n > 0 {
|
||||
return fmt.Sprintf("Validate: %s/GPU × %d = %s (Level 2, sequential). Stress: %s/GPU × %d = %s (Level 3, sequential).",
|
||||
validateFmtDur(perV), n, validateFmtDur(perV*n),
|
||||
validateFmtDur(perS), n, validateFmtDur(perS*n))
|
||||
}
|
||||
return fmt.Sprintf("Validate: %s/GPU (Level 2, sequential). Stress: %s/GPU (Level 3, sequential).",
|
||||
validateFmtDur(perV), validateFmtDur(perS))
|
||||
}(),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedStressPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
func() string {
|
||||
per := platform.SATEstimatedNvidiaTargetedPowerPerGPUSec
|
||||
s := "Skipped in Validate. "
|
||||
if n > 0 {
|
||||
s += fmt.Sprintf("Stress: %s/GPU × %d = %s sequential.", validateFmtDur(per), n, validateFmtDur(per*n))
|
||||
} else {
|
||||
s += fmt.Sprintf("Stress: %s/GPU sequential.", validateFmtDur(per))
|
||||
}
|
||||
return s + `<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`
|
||||
}(),
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||
.then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
})
|
||||
.then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(function(el) { return el.checked && !el.disabled; })
|
||||
.map(function(el) { return parseInt(el.value, 10); })
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
return;
|
||||
}
|
||||
root.innerHTML = gpus.map(function(gpu) {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row">'
|
||||
+ '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
|
||||
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||
+ '</label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = true; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectNoGPUs() {
|
||||
document.querySelectorAll('.sat-nvidia-checkbox').forEach(function(el) { el.checked = false; });
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satLoadGPUs() {
|
||||
loadSatNvidiaGPUs().then(function(gpus) {
|
||||
satRenderGPUList(gpus);
|
||||
}).catch(function(err) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) {
|
||||
root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
}
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satGPUDisplayName(gpu) {
|
||||
const idx = (gpu && Number.isFinite(Number(gpu.index))) ? Number(gpu.index) : 0;
|
||||
const name = gpu && gpu.name ? gpu.name : ('GPU ' + idx);
|
||||
return 'GPU ' + idx + ' — ' + name;
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||
.then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) {
|
||||
term.textContent = '';
|
||||
}
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(function(resolve) {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', function(e) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = function() {
|
||||
if (satES) {
|
||||
satES.close();
|
||||
satES = null;
|
||||
}
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) {
|
||||
return runSATWithOverrides(target, null);
|
||||
}
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
return Promise.resolve(satSelectedGPUIndices());
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
if (!selected.length) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||
target: target,
|
||||
overrides: {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||
},
|
||||
label: satGPUDisplayName(gpu),
|
||||
})));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
return loadSatNvidiaGPUs().then(gpus => {
|
||||
const selected = satSelectedGPUIndices();
|
||||
const picked = gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0);
|
||||
if (!picked.length) {
|
||||
throw new Error('Select at least one NVIDIA GPU.');
|
||||
}
|
||||
if (picked.length === 1) {
|
||||
const gpu = picked[0];
|
||||
return runSATWithOverrides(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')',
|
||||
});
|
||||
}
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— ' + target;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running ' + target + ' one GPU at a time...\n';
|
||||
const labelBase = satLabels()[target] || ('Validate ' + target);
|
||||
const runNext = (idx) => {
|
||||
if (idx >= picked.length) return Promise.resolve();
|
||||
const gpu = picked[idx];
|
||||
const gpuLabel = satGPUDisplayName(gpu);
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + picked.length + '] ' + gpuLabel + '\n';
|
||||
return enqueueSATTarget(target, {
|
||||
gpu_indices: [Number(gpu.index)],
|
||||
display_name: labelBase + ' (' + gpuLabel + ')',
|
||||
}).then(d => {
|
||||
return streamSATTask(d.task_id, labelBase + ' (' + gpuLabel + ')', false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
document.getElementById('sat-output').style.display='block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Running AMD validate set one by one...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = (idx) => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const target = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||
return enqueueSATTarget(target)
|
||||
.then(d => {
|
||||
return streamSATTask(d.task_id, labels[target], false);
|
||||
}).then(function() {
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = 1;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||
const expanded = [];
|
||||
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||
}
|
||||
const total = expanded.length;
|
||||
let enqueued = 0;
|
||||
if (!total) {
|
||||
status.textContent = 'No tasks selected.';
|
||||
return;
|
||||
}
|
||||
const runNext = (idx) => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides)
|
||||
.then(() => {
|
||||
enqueued++;
|
||||
return runNext(idx + 1);
|
||||
});
|
||||
};
|
||||
return runNext(0);
|
||||
}).catch(err => {
|
||||
status.textContent = 'Error: ' + err.message;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
satLoadGPUs();
|
||||
function disableSATAMDOptions(reason) {
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||
const cb = document.getElementById(id);
|
||||
if (!cb) return;
|
||||
cb.disabled = true;
|
||||
cb.checked = false;
|
||||
cb.title = reason;
|
||||
});
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||
unknown := "Audit snapshot not loaded."
|
||||
out := validateInventory{
|
||||
CPU: unknown,
|
||||
Memory: unknown,
|
||||
Storage: unknown,
|
||||
NVIDIA: unknown,
|
||||
AMD: unknown,
|
||||
}
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &snap); err != nil {
|
||||
return out
|
||||
}
|
||||
|
||||
cpuCounts := map[string]int{}
|
||||
cpuTotal := 0
|
||||
for _, cpu := range snap.Hardware.CPUs {
|
||||
if cpu.Present != nil && !*cpu.Present {
|
||||
continue
|
||||
}
|
||||
cpuTotal++
|
||||
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
memCounts := map[string]int{}
|
||||
memTotal := 0
|
||||
for _, dimm := range snap.Hardware.Memory {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
memTotal++
|
||||
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
storageCounts := map[string]int{}
|
||||
storageTotal := 0
|
||||
for _, dev := range snap.Hardware.Storage {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
storageTotal++
|
||||
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
|
||||
nvidiaCounts := map[string]int{}
|
||||
nvidiaTotal := 0
|
||||
amdCounts := map[string]int{}
|
||||
amdTotal := 0
|
||||
for _, dev := range snap.Hardware.PCIeDevices {
|
||||
if dev.Present != nil && !*dev.Present {
|
||||
continue
|
||||
}
|
||||
if validateIsVendorGPU(dev, "nvidia") {
|
||||
nvidiaTotal++
|
||||
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
if validateIsVendorGPU(dev, "amd") {
|
||||
amdTotal++
|
||||
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||
}
|
||||
}
|
||||
|
||||
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||
out.NvidiaGPUCount = nvidiaTotal
|
||||
out.AMDGPUCount = amdTotal
|
||||
return out
|
||||
}
|
||||
|
||||
// renderValidateCardBody returns the four stacked sections of a validate
// card, in order: detected devices, description, commands, settings. The
// first and last sections use the muted text colour. All arguments are
// inserted verbatim as HTML, without escaping.
func renderValidateCardBody(devices, description, commands, settings string) string {
	const (
		plainStyle = "font-size:13px"
		mutedStyle = "font-size:13px;color:var(--muted)"
	)
	// section wraps one piece of content in the shared card-section markup.
	section := func(style, content string) string {
		return `<div class="validate-card-section"><div style="` + style + `">` + content + `</div></div>`
	}
	return section(mutedStyle, devices) +
		section(plainStyle, description) +
		section(plainStyle, commands) +
		section(mutedStyle, settings)
}
|
||||
|
||||
// formatValidateDeviceSummary renders a one-line, HTML-safe device summary.
//
// total is the number of devices, models maps a model name to its count and
// unit is the singular noun ("CPU", "GPU", ...). A zero total yields
// "0 <unit>s detected.". A single model yields "<n> x <model> <unit(s)>".
// Several models yield "<total> <unit(s)>: <n> x <model>, ..." with the models
// sorted by name. Model names are HTML-escaped; unit is not.
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
	if total == 0 {
		return "0 " + unit + "s detected."
	}

	// Sort the model names so the output does not depend on map order.
	names := make([]string, 0, len(models))
	for name := range models {
		names = append(names, name)
	}
	sort.Strings(names)

	entries := make([]string, len(names))
	for i, name := range names {
		entries[i] = fmt.Sprintf("%d x %s", models[name], html.EscapeString(name))
	}

	noun := unit + "s"
	if total == 1 {
		noun = unit
	}

	if len(entries) == 1 {
		return entries[0] + " " + noun
	}
	return fmt.Sprintf("%d %s: %s", total, noun, strings.Join(entries, ", "))
}
|
||||
|
||||
// addValidateModel increments the tally for name in counts. The name is
// whitespace-trimmed first; a blank name is recorded under "unknown".
// counts must be a non-nil map.
func addValidateModel(counts map[string]int, name string) {
	if key := strings.TrimSpace(name); key != "" {
		counts[key]++
		return
	}
	counts["unknown"]++
}
|
||||
|
||||
// validateTrimPtr dereferences value and trims surrounding whitespace.
// A nil pointer yields the empty string.
func validateTrimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||||
|
||||
// validateFirstNonEmpty returns the first argument that is non-empty after
// trimming whitespace, in its trimmed form. It returns "" when every
// argument is blank or when no arguments are given.
func validateFirstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
return false
|
||||
}
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderSATCard returns the HTML for one validate/SAT card.
//
// id becomes the suffix of the Run button's element id ("sat-btn-<id>"),
// label is the card heading, runAction is the JavaScript placed in the Run
// button's onclick attribute, headerActions is optional extra markup appended
// after the Run button (ignored when blank), and body is the card content.
// All arguments are inserted verbatim, without escaping.
func renderSATCard(id, label, runAction, headerActions, body string) string {
	const cardTemplate = `<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`

	headerButtons := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
	if len(strings.TrimSpace(headerActions)) > 0 {
		// The extra markup is appended untrimmed, exactly as supplied.
		headerButtons += headerActions
	}
	return fmt.Sprintf(cardTemplate, label, headerButtons, body)
}
|
||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// taskSerialWriteLine is the function taskSerialEvent calls to emit a line.
// It defaults to writeTaskSerialLine.
// NOTE(review): presumably kept as a variable so tests can substitute a fake
// sink — confirm.
var taskSerialWriteLine = writeTaskSerialLine
|
||||
|
||||
// writeTaskSerialLine writes line, prefixed with a UTC timestamp, to the
// first console device that accepts it. Candidates are tried in order:
// /dev/ttyS0, /dev/ttyS1, /dev/console.
//
// The line is whitespace-trimmed first and blank lines are dropped. The write
// is best-effort: if no device can be opened and written, the line is
// silently discarded.
func writeTaskSerialLine(line string) {
	line = strings.TrimSpace(line)
	if line == "" {
		return
	}
	payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
	for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
		f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
		if err != nil {
			continue
		}
		_, writeErr := f.WriteString(payload)
		_ = f.Close()
		if writeErr == nil {
			return
		}
		// A device node can open successfully and still reject writes
		// (e.g. a serial port with no usable hardware behind it). Fall
		// through to the next candidate instead of dropping the line.
	}
}
|
||||
|
||||
func taskSerialPrefix(t *Task) string {
|
||||
if t == nil {
|
||||
return "[task] "
|
||||
}
|
||||
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||
}
|
||||
|
||||
func taskSerialEvent(t *Task, event string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
71
audit/internal/webui/stability.go
Normal file
71
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"runtime/debug"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
recoverLoopMaxDelay = 60 * time.Second
|
||||
recoverLoopResetAfter = 30 * time.Second
|
||||
)
|
||||
|
||||
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||
// restartDelay is the initial delay; successive panics double it up to
|
||||
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||
// successfully for recoverLoopResetAfter without panicking.
|
||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||
go func() {
|
||||
delay := restartDelay
|
||||
consecutive := 0
|
||||
for {
|
||||
start := time.Now()
|
||||
panicked := runRecoverable(name, fn)
|
||||
if !panicked {
|
||||
return
|
||||
}
|
||||
consecutive++
|
||||
if time.Since(start) >= recoverLoopResetAfter {
|
||||
delay = restartDelay
|
||||
consecutive = 1
|
||||
}
|
||||
slog.Warn("goroutine restarting after panic",
|
||||
"component", name,
|
||||
"consecutive_panics", consecutive,
|
||||
"next_delay", delay,
|
||||
)
|
||||
if delay > 0 {
|
||||
time.Sleep(delay)
|
||||
}
|
||||
if delay < recoverLoopMaxDelay {
|
||||
delay *= 2
|
||||
if delay > recoverLoopMaxDelay {
|
||||
delay = recoverLoopMaxDelay
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func goRecoverOnce(name string, fn func()) {
|
||||
go func() {
|
||||
_ = runRecoverable(name, fn)
|
||||
}()
|
||||
}
|
||||
|
||||
// runRecoverable calls fn and reports whether it panicked. A panic is
// recovered and logged at error level together with the component name, the
// panic value and a stack trace; it is not re-raised.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(rec),
			"stack", string(debug.Stack()),
		)
	}()

	fn()
	return false
}
|
||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
task, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
snapshot := *task
|
||||
body := renderTaskDetailPage(h.opts, snapshot)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(body))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
type taskChartIndexEntry struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
entries := make([]taskChartIndexEntry, 0)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
_ = json.NewEncoder(w).Encode(entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||
path, ok := taskChartPathFromFile(file)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||
title := task.Name
|
||||
if strings.TrimSpace(title) == "" {
|
||||
title = task.ID
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||
}
|
||||
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||
body.WriteString(`</div>`)
|
||||
|
||||
if report := loadTaskReportFragment(task); report != "" {
|
||||
body.WriteString(report)
|
||||
} else {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||
}
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
body.WriteString(`<script>
|
||||
function cancelTaskDetail(id) {
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||
var term = document.getElementById('task-live-log');
|
||||
if (term) {
|
||||
term.textContent += '\nCancel requested.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
});
|
||||
}
|
||||
function renderTaskLiveCharts(taskId, charts) {
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
const seen = {};
|
||||
charts.forEach(function(chart) {
|
||||
seen[chart.file] = true;
|
||||
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||
if (img) {
|
||||
const card = img.closest('.card');
|
||||
if (card) {
|
||||
const title = card.querySelector('.card-head');
|
||||
if (title) title.textContent = chart.title;
|
||||
}
|
||||
return;
|
||||
}
|
||||
const card = document.createElement('div');
|
||||
card.className = 'card';
|
||||
card.style.margin = '0';
|
||||
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||
card.querySelector('.card-head').textContent = chart.title;
|
||||
const body = card.querySelector('.card-body');
|
||||
img = document.createElement('img');
|
||||
img.setAttribute('data-task-chart', '1');
|
||||
img.setAttribute('data-chart-file', chart.file);
|
||||
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||
img.style.width = '100%';
|
||||
img.style.display = 'block';
|
||||
img.style.borderRadius = '6px';
|
||||
img.alt = chart.title;
|
||||
body.appendChild(img);
|
||||
host.appendChild(card);
|
||||
});
|
||||
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||
const file = img.getAttribute('data-chart-file') || '';
|
||||
if (seen[file]) return;
|
||||
const card = img.closest('.card');
|
||||
if (card) card.remove();
|
||||
});
|
||||
}
|
||||
function loadTaskLiveCharts(taskId) {
|
||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||
renderTaskLiveCharts(taskId, charts);
|
||||
}).catch(function(){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||
});
|
||||
}
|
||||
function refreshTaskLiveCharts() {
|
||||
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||
const base = img.dataset.baseSrc;
|
||||
if (!base) return;
|
||||
img.src = base + '?t=' + Date.now();
|
||||
});
|
||||
}
|
||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||
var _taskChartTimer = null;
|
||||
var _taskChartsFrozen = false;
|
||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||
_taskDetailES.addEventListener('done', function(e){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
_taskChartsFrozen = true;
|
||||
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||
refreshTaskLiveCharts();
|
||||
});
|
||||
_taskDetailES.onerror = function(){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
if (_taskDetailES) {
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
}
|
||||
};
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
_taskChartTimer = setInterval(function(){
|
||||
if (_taskChartsFrozen) return;
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
refreshTaskLiveCharts();
|
||||
}, 2000);
|
||||
</script>`)
|
||||
}
|
||||
|
||||
return layoutHead(opts.Title+" — "+title) +
|
||||
layoutNav("tasks", opts.BuildLabel) +
|
||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||
body.String() +
|
||||
`</div></div></body></html>`
|
||||
}
|
||||
|
||||
func loadTaskReportFragment(task Task) string {
|
||||
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||
return ""
|
||||
}
|
||||
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||
if strings.TrimSpace(absPath) == "" {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||
}
|
||||
|
||||
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||
id := r.PathValue("id")
|
||||
taskPtr, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||
}
|
||||
task := *taskPtr
|
||||
start, end := taskTimeWindow(&task)
|
||||
samples, err := loadTaskMetricSamples(start, end)
|
||||
if err != nil {
|
||||
return task, nil, start, end, true
|
||||
}
|
||||
return task, samples, start, end, true
|
||||
}
|
||||
|
||||
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||
start, end := taskTimeWindow(&task)
|
||||
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
}
|
||||
|
||||
func taskChartPathFromFile(file string) (string, bool) {
|
||||
file = strings.TrimSpace(file)
|
||||
for _, spec := range taskDashboardChartSpecs {
|
||||
if spec.File == file {
|
||||
return spec.Path, true
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||
return "gpu/" + id + "-overview", true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
371
audit/internal/webui/task_report.go
Normal file
371
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,371 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// taskReportMetricsDBPath is the metrics database path opened by
// loadTaskMetricSamples. It defaults to metricsDBPath and is a variable so it
// can be pointed at a different database.
var taskReportMetricsDBPath = metricsDBPath
|
||||
|
||||
type taskReport struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
DurationSec int `json:"duration_sec,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LogFile string `json:"log_file,omitempty"`
|
||||
Charts []taskReportChart `json:"charts,omitempty"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
}
|
||||
|
||||
// taskReportChart describes one chart listed in a taskReport.
type taskReportChart struct {
	// Title is the human-readable chart heading.
	Title string `json:"title"`
	// File is the chart's SVG file name.
	File string `json:"file"`
}
|
||||
|
||||
// taskChartSpec pairs a chart path (the identifier handed to
// renderTaskChartSVG) with the file name under which its SVG is stored.
type taskChartSpec struct {
	// Path is the chart identifier, e.g. "server-load" or "gpu/0-overview".
	Path string
	// File is the artifact file name, e.g. "server-load.svg".
	File string
}
|
||||
|
||||
var taskDashboardChartSpecs = []taskChartSpec{
|
||||
{Path: "server-load", File: "server-load.svg"},
|
||||
{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
|
||||
{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
|
||||
{Path: "server-power", File: "server-power.svg"},
|
||||
{Path: "server-fans", File: "server-fans.svg"},
|
||||
{Path: "gpu-all-load", File: "gpu-all-load.svg"},
|
||||
{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
|
||||
{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
|
||||
{Path: "gpu-all-power", File: "gpu-all-power.svg"},
|
||||
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
||||
}
|
||||
|
||||
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||
specs = append(specs, taskDashboardChartSpecs...)
|
||||
for _, idx := range taskGPUIndices(samples) {
|
||||
specs = append(specs, taskChartSpec{
|
||||
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||
})
|
||||
}
|
||||
return specs
|
||||
}
|
||||
|
||||
func writeTaskReportArtifacts(t *Task) error {
|
||||
if t == nil {
|
||||
return nil
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start, end := taskTimeWindow(t)
|
||||
samples, _ := loadTaskMetricSamples(start, end)
|
||||
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||
|
||||
logText := ""
|
||||
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||
logText = string(data)
|
||||
}
|
||||
|
||||
report := taskReport{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||
Error: t.ErrMsg,
|
||||
LogFile: filepath.Base(t.LogPath),
|
||||
Charts: charts,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
}
|
||||
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||
}
|
||||
|
||||
func reportDoneTime(t *Task) time.Time {
|
||||
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
return *t.DoneAt
|
||||
}
|
||||
return time.Now()
|
||||
}
|
||||
|
||||
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||
if t == nil {
|
||||
now := time.Now().UTC()
|
||||
return now, now
|
||||
}
|
||||
start := t.CreatedAt.UTC()
|
||||
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||
start = t.StartedAt.UTC()
|
||||
}
|
||||
end := time.Now().UTC()
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = t.DoneAt.UTC()
|
||||
}
|
||||
if end.Before(start) {
|
||||
end = start
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer db.Close()
|
||||
return db.LoadBetween(start, end)
|
||||
}
|
||||
|
||||
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||
if len(samples) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
var charts []taskReportChart
|
||||
inline := make(map[string]string)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||
if !ok || len(svg) == 0 {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(dir, spec.File)
|
||||
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||
inline[spec.File] = string(svg)
|
||||
}
|
||||
return charts, inline
|
||||
}
|
||||
|
||||
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil || !hasData {
|
||||
return "", nil, false
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
var buf []byte
|
||||
var err error
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
}
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
return title, buf, true
|
||||
}
|
||||
|
||||
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]bool{}
|
||||
var out []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if seen[g.GPUIndex] {
|
||||
continue
|
||||
}
|
||||
seen[g.GPUIndex] = true
|
||||
out = append(out, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func writeJSONFile(path string, v any) error {
|
||||
data, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="grid2">`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||
if strings.TrimSpace(report.Error) != "" {
|
||||
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||
}
|
||||
b.WriteString(`</div></div>`)
|
||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||
b.WriteString(`</div></div></div>`)
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||
b.WriteString(charts[chart.File])
|
||||
b.WriteString(`</div></div>`)
|
||||
}
|
||||
} else {
|
||||
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia-bench-perf":
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||
if len(runs) == 0 {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func renderTaskPowerResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
raw, err := os.ReadFile(resultPath)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var result platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||
}
|
||||
b.WriteString(`</table></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
// taskArchivePathFromLog extracts the archive path announced in a task log.
//
// Lines are scanned from the end of the log towards the beginning. A line
// qualifies when, after trimming whitespace, it starts with "Archive:"; the
// remainder (optionally prefixed with "Archive written to ") is the candidate
// path. The first candidate ending in ".tar.gz" is returned. If no line
// qualifies the result is "".
func taskArchivePathFromLog(logText string) string {
	const (
		marker     = "Archive:"
		writtenTo  = "Archive written to "
		archiveExt = ".tar.gz"
	)

	lines := strings.Split(logText, "\n")
	for n := len(lines); n > 0; n-- {
		entry := strings.TrimSpace(lines[n-1])
		if !strings.HasPrefix(entry, marker) {
			continue
		}
		candidate := strings.TrimSpace(entry[len(marker):])
		if strings.HasPrefix(candidate, writtenTo) {
			candidate = strings.TrimSpace(candidate[len(writtenTo):])
		}
		if strings.HasSuffix(candidate, archiveExt) {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func renderTaskStatusBadge(status string) string {
|
||||
className := map[string]string{
|
||||
TaskRunning: "badge-ok",
|
||||
TaskPending: "badge-unknown",
|
||||
TaskDone: "badge-ok",
|
||||
TaskFailed: "badge-err",
|
||||
TaskCancelled: "badge-unknown",
|
||||
}[status]
|
||||
if className == "" {
|
||||
className = "badge-unknown"
|
||||
}
|
||||
label := strings.TrimSpace(status)
|
||||
if label == "" {
|
||||
label = "unknown"
|
||||
}
|
||||
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||
}
|
||||
|
||||
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||
if ts != nil && !ts.IsZero() {
|
||||
return ts.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
if !fallback.IsZero() {
|
||||
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
return "n/a"
|
||||
}
|
||||
|
||||
// formatTaskDuration renders a duration given in whole seconds:
//
//	sec <= 0     -> "n/a"
//	sec < 60     -> "<s>s"
//	sec < 3600   -> "<m>m <ss>s"
//	otherwise    -> "<h>h <mm>m <ss>s"
//
// Minutes and seconds after the leading unit are zero-padded to two digits.
func formatTaskDuration(sec int) string {
	switch {
	case sec <= 0:
		return "n/a"
	case sec < 60:
		return fmt.Sprintf("%ds", sec)
	case sec < 3600:
		return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
	}
	hours, remainder := sec/3600, sec%3600
	return fmt.Sprintf("%dh %02dm %02ds", hours, remainder/60, sec%60)
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,12 +2,18 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
@@ -22,21 +28,34 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
|
||||
// A task that was pending (not yet started) must be re-queued on restart.
|
||||
pendingTask := &Task{
|
||||
ID: "task-pending",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskRunning,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{
|
||||
Duration: 300,
|
||||
BurnProfile: "smoke",
|
||||
},
|
||||
params: taskParams{Duration: 300, BurnProfile: "smoke"},
|
||||
}
|
||||
// A task that was running when bee-web crashed must NOT be re-queued —
|
||||
// its child processes (e.g. gpu-burn-worker) survive the restart in
|
||||
// their own process groups and can't be cancelled retroactively.
|
||||
runningTask := &Task{
|
||||
ID: "task-running",
|
||||
Name: "NVIDIA GPU Stress",
|
||||
Target: "nvidia-stress",
|
||||
Priority: 1,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-3 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{Duration: 86400},
|
||||
}
|
||||
for _, task := range []*Task{pendingTask, runningTask} {
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
}
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
@@ -46,18 +65,47 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||
if len(recovered.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(recovered.tasks))
|
||||
}
|
||||
got := recovered.tasks[0]
|
||||
if got.Status != TaskPending {
|
||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||
|
||||
byID := map[string]*Task{}
|
||||
for i := range recovered.tasks {
|
||||
byID[recovered.tasks[i].ID] = recovered.tasks[i]
|
||||
}
|
||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("params=%+v", got.params)
|
||||
|
||||
// Pending task must be re-queued as pending with params intact.
|
||||
p := byID["task-pending"]
|
||||
if p == nil {
|
||||
t.Fatal("task-pending not found")
|
||||
}
|
||||
if got.LogPath == "" {
|
||||
t.Fatal("expected log path")
|
||||
if p.Status != TaskPending {
|
||||
t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
|
||||
}
|
||||
if p.StartedAt != nil {
|
||||
t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
|
||||
}
|
||||
if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("pending task: params=%+v", p.params)
|
||||
}
|
||||
if p.LogPath == "" {
|
||||
t.Fatal("pending task: expected log path")
|
||||
}
|
||||
|
||||
// Running task must be marked failed, not re-queued, to prevent
|
||||
// launching duplicate workers (e.g. a second set of gpu-burn-workers).
|
||||
r := byID["task-running"]
|
||||
if r == nil {
|
||||
t.Fatal("task-running not found")
|
||||
}
|
||||
if r.Status != TaskFailed {
|
||||
t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
|
||||
}
|
||||
if r.ErrMsg == "" {
|
||||
t.Fatal("running task: expected non-empty error message")
|
||||
}
|
||||
if r.DoneAt == nil {
|
||||
t.Fatal("running task: expected done_at to be set")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,15 +126,363 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
tasks: []*Task{
|
||||
{
|
||||
ID: "old-running",
|
||||
Name: "Old Running",
|
||||
Status: TaskRunning,
|
||||
Priority: 10,
|
||||
CreatedAt: now.Add(-3 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "new-done",
|
||||
Name: "New Done",
|
||||
Status: TaskDone,
|
||||
Priority: 0,
|
||||
CreatedAt: now.Add(-1 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "mid-pending",
|
||||
Name: "Mid Pending",
|
||||
Status: TaskPending,
|
||||
Priority: 1,
|
||||
CreatedAt: now.Add(-2 * time.Minute),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := q.snapshot()
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||
}
|
||||
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
origCounter := jobCounter.Load()
|
||||
jobCounter.Store(0)
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
jobCounter.Store(origCounter)
|
||||
})
|
||||
|
||||
if got := newJobID("ignored"); got != "TASK-000" {
|
||||
t.Fatalf("id=%q want TASK-000", got)
|
||||
}
|
||||
if got := newJobID("ignored"); got != "TASK-001" {
|
||||
t.Fatalf("id=%q want TASK-001", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
task := &Task{
|
||||
ID: "TASK-007",
|
||||
Name: "NVIDIA Benchmark",
|
||||
}
|
||||
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||
if !strings.HasPrefix(got, "007_") {
|
||||
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
db, err := openMetricsDB(metricsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
base := time.Now().UTC().Add(-45 * time.Second)
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base,
|
||||
CPULoadPct: 42,
|
||||
MemLoadPct: 35,
|
||||
PowerW: 510,
|
||||
}); err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
_ = db.Close()
|
||||
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
started := time.Now().UTC().Add(-90 * time.Second)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: started.Add(-10 * time.Second),
|
||||
StartedAt: &started,
|
||||
}
|
||||
q.assignTaskLogPathLocked(task)
|
||||
appendJobLog(task.LogPath, "line-1")
|
||||
|
||||
job := newTaskJobState(task.LogPath)
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
if task.Status != TaskDone {
|
||||
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||
}
|
||||
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||
t.Fatalf("report json: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||
t.Fatalf("report html: %v", err)
|
||||
}
|
||||
var report taskReport
|
||||
data, err := os.ReadFile(task.ReportJSONPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.json): %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(data, &report); err != nil {
|
||||
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||
}
|
||||
if report.ID != task.ID || report.Status != TaskDone {
|
||||
t.Fatalf("report=%+v", report)
|
||||
}
|
||||
if len(report.Charts) == 0 {
|
||||
t.Fatalf("expected charts in report, got none")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Bee Bench Perf",
|
||||
Target: "nvidia-bench-perf",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||
}
|
||||
|
||||
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.html): %v", err)
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Perf Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`GPU 0`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
t.Fatalf("report missing %q: %s", needle, html)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||
var lines []string
|
||||
prev := taskSerialWriteLine
|
||||
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-serial-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
|
||||
q.enqueue(task)
|
||||
started := time.Now().UTC()
|
||||
task.Status = TaskRunning
|
||||
task.StartedAt = &started
|
||||
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
job.append("Starting CPU SAT...")
|
||||
job.append("CPU stress duration: 60s")
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
joined := strings.Join(lines, "\n")
|
||||
for _, needle := range []string{
|
||||
"queued",
|
||||
"Starting CPU SAT...",
|
||||
"CPU stress duration: 60s",
|
||||
"finished with status=done",
|
||||
} {
|
||||
if !strings.Contains(joined, needle) {
|
||||
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
want burnPreset
|
||||
}{
|
||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||
@@ -95,9 +491,101 @@ func TestResolveBurnPreset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
profile string
|
||||
enabled bool
|
||||
selected []int
|
||||
want nvidiaRampSpec
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "disabled uses base preset",
|
||||
profile: "acceptance",
|
||||
selected: []int{0, 1},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "smoke ramp uses two minute steps",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||
},
|
||||
{
|
||||
name: "acceptance ramp uses ten minute steps",
|
||||
profile: "acceptance",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight stays at eight hours when possible",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight extends to keep one hour after final gpu",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight rejects impossible gpu count",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||
wantErr: "at most 10 GPUs",
|
||||
},
|
||||
{
|
||||
name: "enabled requires explicit selection",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
wantErr: "requires explicit GPU selection",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||
if tc.wantErr != "" {
|
||||
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||
}
|
||||
if got != tc.want {
|
||||
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
blocked := make(chan struct{})
|
||||
released := make(chan struct{})
|
||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
@@ -154,3 +642,210 @@ func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Fatal("runTask did not return after cancel")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-burn-1",
|
||||
Name: "CPU Burn-in",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{BurnProfile: "smoke"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||
gotDuration = durationSec
|
||||
return "/tmp/cpu-burn.tar.gz", nil
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDuration != 5*60 {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||
var gotSizeMB, gotPasses int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "mem-validate-1",
|
||||
Name: "Memory SAT",
|
||||
Target: "memory",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{StressMode: true},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runMemoryAcceptancePackCtx
|
||||
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||
gotSizeMB = sizeMB
|
||||
gotPasses = passes
|
||||
return "/tmp/memory-validate.tar.gz", nil
|
||||
}
|
||||
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotSizeMB != 512 || gotPasses != 1 {
|
||||
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "support-bundle-1",
|
||||
Name: "Support Bundle",
|
||||
Target: "support-bundle",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotExportDir string
|
||||
orig := buildSupportBundle
|
||||
buildSupportBundle = func(exportDir string) (string, error) {
|
||||
gotExportDir = exportDir
|
||||
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||
}
|
||||
defer func() { buildSupportBundle = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotExportDir != dir {
|
||||
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||
t.Fatalf("lines=%v", j.lines)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||
started := time.Time{}
|
||||
task := &Task{
|
||||
Status: TaskRunning,
|
||||
CreatedAt: created,
|
||||
StartedAt: &started,
|
||||
}
|
||||
if got := taskElapsedSec(task, now); got != 0 {
|
||||
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||
}
|
||||
|
||||
stale := created.Add(-24 * time.Hour)
|
||||
task.StartedAt = &stale
|
||||
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "install-1",
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Device: "/dev/sda"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotDevice string
|
||||
var gotLogPath string
|
||||
orig := installCommand
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
gotDevice = device
|
||||
gotLogPath = logPath
|
||||
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||
}
|
||||
defer func() { installCommand = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDevice != "/dev/sda" {
|
||||
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||
}
|
||||
if gotLogPath == "" {
|
||||
t.Fatal("expected install log path")
|
||||
}
|
||||
logs := strings.Join(j.lines, "\n")
|
||||
if !strings.Contains(logs, "Install log: ") {
|
||||
t.Fatalf("missing install log line: %v", j.lines)
|
||||
}
|
||||
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||
t.Fatalf("missing streamed output: %v", j.lines)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
kmsgWatcher: newKmsgWatcher(nil),
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-panic-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
panic("boom")
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.executeTask(tk, j, context.Background())
|
||||
|
||||
if tk.Status != TaskFailed {
|
||||
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||
}
|
||||
if tk.DoneAt == nil {
|
||||
t.Fatal("expected done_at to be set")
|
||||
}
|
||||
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||
}
|
||||
if !strings.Contains(j.err, "task panic: boom") {
|
||||
t.Fatalf("job error=%q", j.err)
|
||||
}
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount := q.kmsgWatcher.activeCount
|
||||
window := q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||
}
|
||||
if window != nil {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
# Print the version string for the current checkout on stdout.
#
# The value comes from `git describe`, limited to tags matching 'v[0-9]*',
# with a 7-character abbreviated hash and the --dirty marker enabled. One
# leading "v" is dropped from the result. If git describe produces no output,
# the script prints "dev" instead.
set -eu

# stderr is discarded and a failing `git describe` is tolerated on purpose:
# the `|| true` keeps `set -e` from aborting, so an empty result can be
# mapped to "dev" below.
described="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"

if [ -z "${described}" ]; then
	printf 'dev\n'
else
	# "${described#v}" removes a single leading "v" and leaves any other
	# string unchanged, so one expansion covers both non-empty cases.
	printf '%s\n' "${described#v}"
fi
|
||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
## Rule: live charts must be visually uniform
|
||||
|
||||
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||
changes to existing charts must keep the same rendering model and presentation
|
||||
rules unless there is an explicit architectural decision to diverge.
|
||||
|
||||
Default expectations:
|
||||
|
||||
- same server-side SVG pipeline for all live metrics charts
|
||||
- same refresh behaviour and failure handling in the browser
|
||||
- same canvas size class and card layout
|
||||
- same legend placement policy across charts
|
||||
- same axis, title, and summary conventions
|
||||
- no chart-specific visual exceptions added as a quick fix
|
||||
|
||||
Current default for live charts:
|
||||
|
||||
- legend below the plot area when a chart has 8 series or fewer
|
||||
- legend hidden when a chart has more than 8 series
|
||||
- 10 equal Y-axis steps across the chart height
|
||||
- 1400 x 360 SVG canvas with legend
|
||||
- 1400 x 288 SVG canvas without legend
|
||||
- full-width card rendering in a single-column stack
|
||||
|
||||
If one chart needs a different layout or legend behaviour, treat that as a
|
||||
design-level decision affecting the whole chart family, not as a local tweak to
|
||||
just one endpoint.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
||||
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||
the legend is hidden. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
@@ -60,6 +60,8 @@ Rules:
|
||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||
- SSH is independent from the desktop path
|
||||
- serial console support is enabled for VM boot debugging
|
||||
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||
|
||||
## ISO build sequence
|
||||
|
||||
|
||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||
The commit history shows several distinct attempts:
|
||||
|
||||
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||
|
||||
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||
|
||||
- `lb binary_memtest` does run and installs `memtest86+`
|
||||
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||
|
||||
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||
|
||||
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||
|
||||
- the build now completes successfully because memtest is non-blocking by default
|
||||
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||
- but it executes too early for its current target paths:
|
||||
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||
- memtest binaries are also still absent in `binary/boot/`
|
||||
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||
|
||||
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||
|
||||
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
artifact dated 2026-04-01:
|
||||
|
||||
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||
and `isolinux/live.cfg`
|
||||
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||
shipped ISO
|
||||
- the regression was in the build-time validator/debug path in `build.sh`
|
||||
|
||||
Root cause of the false alarm:
|
||||
|
||||
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||
successfully listed/extracted members"
|
||||
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||
observable output as "memtest content missing"
|
||||
- this made a reader failure look identical to a missing memtest payload
|
||||
- as a result, we re-entered the same memtest investigation loop even though
|
||||
the real ISO was already correct
|
||||
|
||||
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||
|
||||
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||
`iso_memtest_present` return code of `1` as fatal
|
||||
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||
the recovery design itself was wrong
|
||||
|
||||
## Known Failed Attempts
|
||||
|
||||
These approaches were already tried and should not be repeated blindly:
|
||||
|
||||
1. Built-in live-build memtest only.
|
||||
Reason it failed:
|
||||
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||
|
||||
2. Fixing only the memtest file names for Debian Bookworm.
|
||||
Reason it failed:
|
||||
- correct file names alone do not make the files appear in the final ISO.
|
||||
|
||||
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||
Reason it failed:
|
||||
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||
|
||||
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||
Reason it failed:
|
||||
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||
|
||||
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||
Reason it failed:
|
||||
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||
|
||||
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||
Reason it failed:
|
||||
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||
|
||||
## What This Means
|
||||
|
||||
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||
|
||||
- do not assume the built-in memtest stage is sufficient
|
||||
- do not assume `chroot/boot/` will contain memtest payloads
|
||||
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||
- do not assume the current normal binary hook timing is late enough for final patching
|
||||
|
||||
Any future memtest fix must explicitly identify:
|
||||
|
||||
- where the memtest binaries are reliably available at build time
|
||||
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||
- a post-build proof from a real ISO, not only from intermediate workdir files
|
||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||
the validator printed a memtest warning
|
||||
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||
context rather than accidentally tripping `set -e`
|
||||
|
||||
## Decision
|
||||
|
||||
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||
|
||||
Project rules from now on:
|
||||
|
||||
- Do **not** trust `--memtest memtest86+` by itself.
|
||||
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a GRUB menu entry
|
||||
- an isolinux menu entry
|
||||
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||
- a binary hook copying files into `binary/boot/`
|
||||
- extraction from the cached `memtest86+` `.deb`
|
||||
- another deterministic build-time copy step
|
||||
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||
|
||||
Current implementation direction:
|
||||
|
||||
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||
- do not rely on the current early `binary_hooks` timing for final patching
|
||||
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||
- patches the fully materialized `LB_DIR/binary` tree
|
||||
- injects memtest binaries there
|
||||
- ensures final bootloader entries there
|
||||
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||
- also treat ISO validation tooling as part of the critical path:
|
||||
- install a stable ISO reader in the builder image
|
||||
- fail with an explicit reader error if ISO listing/extraction fails
|
||||
- do not treat reader failure as evidence that memtest is missing
|
||||
- do not call a probe that may return "needs recovery" as a bare command under
|
||||
`set -e`; wrap it in explicit control flow
|
||||
|
||||
## Consequences
|
||||
|
||||
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Setting the `BEE_REQUIRE_MEMTEST=1` env var turns these warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-alarm loop that consumed builds v3.14–v3.19 on 2026-04-01.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| Date | Decision | Status |
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
|
||||
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,277 @@
|
||||
# Benchmark clock calibration research
|
||||
|
||||
## Benchmark methodology versioning
|
||||
|
||||
Every benchmark methodology change must bump the benchmark version constant in
|
||||
source code by exactly `+1`.
|
||||
|
||||
Methodology change means any change that affects comparability of benchmark
|
||||
results, including for example:
|
||||
- phase durations or phase order
|
||||
- enabled/disabled precisions
|
||||
- fallback rules
|
||||
- normalization rules
|
||||
- score formulas or weights
|
||||
- degradation thresholds
|
||||
- power calibration logic
|
||||
- thermal/power penalty logic
|
||||
|
||||
Requirements:
|
||||
- benchmark version must be stored in source code as an explicit version
|
||||
constant, not inferred from git tag or build metadata
|
||||
- benchmark report must always print the benchmark version
|
||||
- `result.json` must always include the benchmark version
|
||||
- results from different benchmark versions must be treated as non-comparable by
|
||||
default
|
||||
|
||||
Purpose:
|
||||
- prevent accidental comparison of runs produced by different methodologies
|
||||
- make historical benchmark archives self-describing even when detached from git
|
||||
- force deliberate version bumps whenever scoring or execution semantics change
|
||||
|
||||
## Status
|
||||
In progress. Baseline data from production servers pending.
|
||||
|
||||
## Background
|
||||
|
||||
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||
`avg_steady_clock < locked_target * 0.90`.
|
||||
|
||||
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||
The 90% threshold has no empirical basis.
|
||||
|
||||
## Key observations (2026-04-06)
|
||||
|
||||
### H100 PCIe — new card, server not designed for it
|
||||
- avg clock 1384 MHz, P95 1560 MHz (unstable; boost is probably 1755 MHz)
|
||||
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||
|
||||
### H200 NVL — new card, server not designed for it
|
||||
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||
- Degradation: power_capped, thermal_limited
|
||||
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||
|
||||
### Key insight
|
||||
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||
instability may reflect a more severe thermal mismatch or a card issue.
|
||||
|
||||
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||
|
||||
## Hypothesis for baseline
|
||||
|
||||
After testing on servers designed for their GPUs (proper cooling):
|
||||
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||
a healthy card under real workload should comfortably exceed base clock
|
||||
|
||||
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||
|
||||
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||
|
||||
### Healthy fingerprint
|
||||
|
||||
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||
- Avg steady (visual): **~1580–1620 MHz**
|
||||
- vs boost 1755 MHz: **~91–92%**
|
||||
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||
|
||||
### Characteristic pattern
|
||||
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||
power cap algorithm. Do NOT flag this as instability.
|
||||
|
||||
### Clock CV implication
|
||||
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||
healthy HBM2e PCIe cards. Needs recalibration.
|
||||
|
||||
---
|
||||
|
||||
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||
|
||||
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||
|
||||
### GPU clock reference (from nvidia-smi, idle):
|
||||
- base_clock_mhz: **1095**
|
||||
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||
|
||||
### Observed under 700W sustained load (both samples nearly identical):
|
||||
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||
|
||||
### Diagnosis
|
||||
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||
refurbishment process.
|
||||
|
||||
### Clock pattern note
|
||||
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||
|
||||
---
|
||||
|
||||
## Baseline matrix (filled where data available)
|
||||
|
||||
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||
|---|---|---|---|---|---|
|
||||
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||
|
||||
---
|
||||
|
||||
## H100 official spec (from NVIDIA datasheet)
|
||||
|
||||
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||
|
||||
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||
|---|---|---|---|---|---|
|
||||
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||
|
||||
Notes:
|
||||
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||
|
||||
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||
|
||||
From the report in this session (power+thermal throttle throughout steady):
|
||||
|
||||
| Precision | Measured | Spec (dense) | % of spec |
|
||||
|---|---|---|---|
|
||||
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||
|
||||
30–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||
|
||||
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||
|
||||
Format: without sparsity / with sparsity.
|
||||
|
||||
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||
|---|---|---|---|---|---|
|
||||
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||
|
||||
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||
|
||||
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||
|
||||
| Precision | Measured | Spec (dense) | % of spec |
|
||||
|---|---|---|---|
|
||||
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||
|
||||
Comparable to H100 PCIe efficiency (30–44%) despite different architecture —
|
||||
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||
|
||||
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||
|
||||
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||
|
||||
### What healthy systems actually achieve:
|
||||
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||
|
||||
### Our results vs expectation:
|
||||
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||
|---|---|---|---|---|
|
||||
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||
|
||||
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||
This is NOT normal — 30-44% is not the industry baseline.
|
||||
|
||||
### Likely causes of the gap (in order of probability):
|
||||
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||
|
||||
### Power limit gap analysis (H100 PCIe):
|
||||
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||
- Actually measured: 329 TOPS = 55% of that estimate
|
||||
- Remaining gap after accounting for clock throttle: ~45%
|
||||
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||
sustainable clock beyond what sw_thermal alone would cause.
|
||||
|
||||
### Action item:
|
||||
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||
so result.json shows if the card was pre-configured with a non-default limit.
|
||||
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||
|
||||
### CPU/RAM impact on GPU FLOPS:
|
||||
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||
CPU core count and host RAM are irrelevant.
|
||||
|
||||
## Compute efficiency metric (proposed, no hardcode)
|
||||
|
||||
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||
|
||||
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||
normal clocks.
|
||||
|
||||
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||
(needs to be added to queryBenchmarkGPUInfo).
|
||||
|
||||
Reference values to establish after baseline runs:
|
||||
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||
|
||||
## Proposed threshold changes (pending more data)
|
||||
|
||||
1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
|
||||
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||
capture the root cause.
|
||||
|
||||
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||
under power cap. Consider suppressing this flag when power is flat and usage
|
||||
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||
|
||||
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||
would have been caught by this).
|
||||
|
||||
Decision deferred until baseline on SXM designed servers collected.
|
||||
121
bible-local/docs/gpu-model-propagation.md
Normal file
121
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,121 @@
|
||||
# GPU Model Name Propagation
|
||||
|
||||
How GPU model names are detected, stored, and displayed throughout the project.
|
||||
|
||||
---
|
||||
|
||||
## Detection Sources
|
||||
|
||||
There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||
|
||||
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||
|
||||
**File:** `audit/internal/platform/sat.go`
|
||||
|
||||
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||
|
||||
### Pipeline B — Benchmark results
|
||||
|
||||
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||
|
||||
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||
- Used by: benchmark history table, benchmark report
|
||||
|
||||
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||
|
||||
**File:** `audit/internal/schema/hardware.go`
|
||||
|
||||
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||
|
||||
---
|
||||
|
||||
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||
|
||||
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||
|
||||
This means:
|
||||
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||
- AMD GPUs do have their model populated
|
||||
|
||||
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||
|
||||
---
|
||||
|
||||
## Benchmark History "Unknown GPU" Issue
|
||||
|
||||
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||
|
||||
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||
|
||||
This happens for:
|
||||
- Older result files saved before the `Name` field was added
|
||||
- Runs where nvidia-smi query failed before the benchmark started
|
||||
|
||||
---
|
||||
|
||||
## Fallback Strings — Current State
|
||||
|
||||
| Location | File | Fallback string |
|
||||
|---|---|---|
|
||||
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||
|
||||
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||
|
||||
---
|
||||
|
||||
## GPU Selection UI
|
||||
|
||||
**File:** `audit/internal/webui/pages.go`
|
||||
|
||||
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||
|
||||
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||
|
||||
---
|
||||
|
||||
## Data Flow Summary
|
||||
|
||||
```
|
||||
nvidia-smi (live)
|
||||
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||
├─ GPU selection UI (always correct)
|
||||
├─ Live metrics labels (charts_svg.go)
|
||||
└─ SAT/burn status file (sat.go)
|
||||
|
||||
nvidia-smi (at benchmark start)
|
||||
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||
├─ Benchmark report
|
||||
└─ Benchmark history table columns
|
||||
|
||||
nvidia-smi / lspci (audit collection)
|
||||
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||
└─ Hardware summary page hwDescribeGPU()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Fixed Issues
|
||||
|
||||
All previously open items are resolved:
|
||||
|
||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||
@@ -13,9 +13,85 @@ Use one of:
|
||||
|
||||
This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`)
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Example of what goes wrong without this
|
||||
## Bootloader sync rule
|
||||
|
||||
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`.
|
||||
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild.
|
||||
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||
|
||||
| File | Used by |
|
||||
|------|---------|
|
||||
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||
|
||||
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||
change, or new mode added to one file must be manually mirrored in the other.
|
||||
|
||||
**Canonical entry list** (both files must have all of these):
|
||||
|
||||
| Label | Key params |
|
||||
|-------|-----------|
|
||||
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||
|
||||
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||
```
|
||||
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
nowatchdog nosoftlockup
|
||||
```
|
||||
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||
|
||||
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||
|
||||
For this project, memtest is accepted only when the produced ISO actually
|
||||
contains all of the following:
|
||||
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a memtest entry in `boot/grub/grub.cfg`
|
||||
- a memtest entry in `isolinux/live.cfg`
|
||||
|
||||
Rules:
|
||||
|
||||
- Keep explicit post-build memtest validation in `build.sh`.
|
||||
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||
If the reader cannot list or extract from the ISO, that is a validator
|
||||
failure, not proof that memtest is missing.
|
||||
- If built-in integration does not produce the artifacts above, use a
|
||||
deterministic project-owned copy/extract step instead of hoping live-build
|
||||
will "start working".
|
||||
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||
a real ISO.
|
||||
- If you reference memtest files manually, verify the exact package file list
|
||||
first for the target Debian release.
|
||||
|
||||
Known bad loops for this repository:
|
||||
|
||||
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||
the final artifact path.
|
||||
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||
bootloader configs.
|
||||
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||
present yet when the hook executed.
|
||||
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||
regression because the final ISO was correct but the validator produced a
|
||||
false negative.
|
||||
|
||||
Submodule internal/chart updated: 05db6994d4...ac8120c8ab
@@ -17,6 +17,7 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
wget \
|
||||
curl \
|
||||
tar \
|
||||
libarchive-tools \
|
||||
xz-utils \
|
||||
rsync \
|
||||
build-essential \
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=auto
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.2-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
MEMTEST_VERSION=6.10-4
|
||||
|
||||
@@ -23,16 +23,17 @@ lb config noauto \
|
||||
--bootloaders "grub-efi,syslinux" \
|
||||
--debian-installer none \
|
||||
--archive-areas "main contrib non-free non-free-firmware" \
|
||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
||||
--mirror-chroot "https://deb.debian.org/debian" \
|
||||
--mirror-binary "https://deb.debian.org/debian" \
|
||||
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--debootstrap-options "--include=ca-certificates" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -33,9 +33,10 @@ typedef void *CUstream;
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
#define MAX_STRESS_STREAMS 16
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -297,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
||||
return stream_count;
|
||||
}
|
||||
|
||||
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
return profile_budget_bytes;
|
||||
}
|
||||
|
||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||
if (!api->cuStreamDestroy) {
|
||||
return;
|
||||
@@ -343,7 +351,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
unsigned long iterations = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int launches_per_wave = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||
@@ -418,10 +425,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
|
||||
unsigned int threads = 256;
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
launches_per_wave = 0;
|
||||
int launched = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
@@ -439,16 +446,21 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched++;
|
||||
iterations++;
|
||||
}
|
||||
if (launches_per_wave <= 0) {
|
||||
if (launched <= 0) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
}
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
iterations += (unsigned long)launches_per_wave;
|
||||
}
|
||||
api->cuCtxSynchronize();
|
||||
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||
goto fail;
|
||||
@@ -597,6 +609,20 @@ struct prepared_profile {
|
||||
};
|
||||
|
||||
static const struct profile_desc k_profiles[] = {
|
||||
{
|
||||
"fp64",
|
||||
"fp64",
|
||||
80,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
8,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUBLAS_COMPUTE_64F,
|
||||
},
|
||||
{
|
||||
"fp32_tf32",
|
||||
"fp32",
|
||||
@@ -625,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
|
||||
CUDA_R_16F,
|
||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||
},
|
||||
{
|
||||
"int8_tensor",
|
||||
"int8",
|
||||
75,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
128,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_8I,
|
||||
CUDA_R_32I,
|
||||
CUDA_R_32I,
|
||||
CUBLAS_COMPUTE_32I,
|
||||
},
|
||||
{
|
||||
"fp8_e4m3",
|
||||
"fp8",
|
||||
@@ -671,6 +711,21 @@ static const struct profile_desc k_profiles[] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||
|
||||
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||
return 0;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
return strcmp(desc->block_label, precision_filter) == 0;
|
||||
}
|
||||
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||
* pass after earlier phases already collected useful telemetry. */
|
||||
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||
}
|
||||
|
||||
static int load_cublaslt(struct cublaslt_api *api) {
|
||||
memset(api, 0, sizeof(*api));
|
||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||
@@ -741,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
switch (type) {
|
||||
case CUDA_R_32F:
|
||||
case CUDA_R_32I:
|
||||
return (size_t)(elements * 4u);
|
||||
case CUDA_R_16F:
|
||||
case CUDA_R_16BF:
|
||||
return (size_t)(elements * 2u);
|
||||
case CUDA_R_8I:
|
||||
case CUDA_R_8F_E4M3:
|
||||
case CUDA_R_8F_E5M2:
|
||||
return (size_t)(elements);
|
||||
@@ -757,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||
}
|
||||
}
|
||||
|
||||
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||
return CUDA_R_32I;
|
||||
}
|
||||
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||
return CUDA_R_64F;
|
||||
}
|
||||
return CUDA_R_32F;
|
||||
}
|
||||
|
||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||
@@ -863,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
CUstream stream,
|
||||
size_t profile_budget_bytes,
|
||||
struct prepared_profile *out) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t bytes_per_cell = 0;
|
||||
size_t attempt_budget = profile_budget_bytes;
|
||||
|
||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||
@@ -876,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||
out->m = dim;
|
||||
out->n = dim;
|
||||
out->k = dim;
|
||||
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
size_t desired_workspace = attempt_budget / 8u;
|
||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||
desired_workspace = 32u * 1024u * 1024u;
|
||||
}
|
||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= profile_budget_bytes) {
|
||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
size_t a_bytes = 0;
|
||||
size_t b_bytes = 0;
|
||||
size_t c_bytes = 0;
|
||||
size_t d_bytes = 0;
|
||||
size_t scale_bytes = 0;
|
||||
while (1) {
|
||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||
|
||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||
if (matrix_bytes <= attempt_budget) {
|
||||
size_t remaining = attempt_budget - matrix_bytes;
|
||||
out->workspace_size = desired_workspace;
|
||||
if (out->workspace_size > remaining) {
|
||||
out->workspace_size = round_down_size(remaining, 256u);
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
break;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
if (out->m < (uint64_t)desc->min_multiple) {
|
||||
attempt_budget /= 2u;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||
return 0;
|
||||
}
|
||||
out->m -= (uint64_t)desc->min_multiple;
|
||||
out->n = out->m;
|
||||
out->k = out->m;
|
||||
}
|
||||
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
|
||||
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
if (!check_cublas("set TRANSA",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||
&transa,
|
||||
sizeof(transa))) ||
|
||||
!check_cublas("set TRANSB",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||
&transb,
|
||||
sizeof(transb)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (desc->needs_scalar_scale) {
|
||||
float one = 1.0f;
|
||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||
if (!check_cublas("set A scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||
&a_scale_ptr,
|
||||
sizeof(a_scale_ptr))) ||
|
||||
!check_cublas("set B scale ptr",
|
||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||
&b_scale_ptr,
|
||||
sizeof(b_scale_ptr)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||
if (desc->needs_block_scale) {
|
||||
@@ -1014,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
if (!check_cublas("create A layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||
!check_cublas("create B layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||
!check_cublas("create C layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||
!check_cublas("create D layout",
|
||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (out->workspace_size > 0) {
|
||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found)) &&
|
||||
found > 0) {
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
destroy_profile(cublas, cuda, out);
|
||||
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_cublas("set workspace",
|
||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||
out->preference,
|
||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||
&out->workspace_size,
|
||||
sizeof(out->workspace_size)))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int found = 0;
|
||||
if (!check_cublas("heuristic",
|
||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||
out->op_desc,
|
||||
out->a_layout,
|
||||
out->b_layout,
|
||||
out->c_layout,
|
||||
out->d_layout,
|
||||
out->preference,
|
||||
1,
|
||||
&out->heuristic,
|
||||
&found))) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
if (found <= 0) {
|
||||
destroy_profile(cublas, cuda, out);
|
||||
return 0;
|
||||
}
|
||||
|
||||
out->ready = 1;
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||
struct cublaslt_api *cublas,
|
||||
struct prepared_profile *profile) {
|
||||
int32_t alpha_i32 = 1;
|
||||
int32_t beta_i32 = 0;
|
||||
double alpha_f64 = 1.0;
|
||||
double beta_f64 = 0.0;
|
||||
float alpha = 1.0f;
|
||||
float beta = 0.0f;
|
||||
const void *alpha_ptr = &alpha;
|
||||
const void *beta_ptr = &beta;
|
||||
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||
alpha_ptr = &alpha_i32;
|
||||
beta_ptr = &beta_i32;
|
||||
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||
alpha_ptr = &alpha_f64;
|
||||
beta_ptr = &beta_f64;
|
||||
}
|
||||
return check_cublas(profile->desc.name,
|
||||
cublas->cublasLtMatmul(handle,
|
||||
profile->op_desc,
|
||||
&alpha,
|
||||
alpha_ptr,
|
||||
(const void *)(uintptr_t)profile->a_dev,
|
||||
profile->a_layout,
|
||||
(const void *)(uintptr_t)profile->b_dev,
|
||||
profile->b_layout,
|
||||
&beta,
|
||||
beta_ptr,
|
||||
(const void *)(uintptr_t)profile->c_dev,
|
||||
profile->c_layout,
|
||||
(void *)(uintptr_t)profile->d_dev,
|
||||
@@ -1103,9 +1194,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int cc_minor,
|
||||
int seconds,
|
||||
int size_mb,
|
||||
const char *precision_filter,
|
||||
struct stress_report *report) {
|
||||
struct cublaslt_api cublas;
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||
cublasLtHandle_t handle = NULL;
|
||||
CUcontext ctx = NULL;
|
||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||
@@ -1115,12 +1207,12 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int active = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int profile_count = PROFILE_COUNT;
|
||||
int prepared_count = 0;
|
||||
int wave_launches = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
int budget_profiles = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||
@@ -1141,8 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count profiles matching the filter (for deciding what to run). */
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned++;
|
||||
}
|
||||
}
|
||||
@@ -1153,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count all profiles active on this GPU regardless of filter.
|
||||
* Mixed phases still divide budget across the full precision set, while
|
||||
* single-precision benchmark phases dedicate budget only to active
|
||||
* profiles matching precision_filter. */
|
||||
int planned_total = 0;
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||
planned_total++;
|
||||
}
|
||||
}
|
||||
if (planned_total < planned) {
|
||||
planned_total = planned;
|
||||
}
|
||||
budget_profiles = planned_total;
|
||||
if (precision_filter != NULL) {
|
||||
budget_profiles = planned;
|
||||
}
|
||||
if (budget_profiles <= 0) {
|
||||
budget_profiles = planned_total;
|
||||
}
|
||||
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||
}
|
||||
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
@@ -1177,18 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (precision_filter != NULL) {
|
||||
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||
}
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
mp_count,
|
||||
budget_profiles,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
for (int i = 0; i < profile_count; i++) {
|
||||
@@ -1201,6 +1322,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
desc->min_cc);
|
||||
continue;
|
||||
}
|
||||
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=SKIPPED benchmark_disabled\n",
|
||||
desc->name);
|
||||
continue;
|
||||
}
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
CUstream stream = streams[lane];
|
||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||
@@ -1236,9 +1364,15 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Keep the GPU queue continuously full by submitting kernels without
|
||||
* synchronizing after every wave. A sync barrier after each small batch
|
||||
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||
* especially when individual kernels are short. Instead we sync at most
|
||||
* once per second (for error detection) and once at the very end. */
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
wave_launches = 0;
|
||||
int launched = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
@@ -1258,21 +1392,27 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched++;
|
||||
}
|
||||
if (wave_launches <= 0) {
|
||||
if (launched <= 0) {
|
||||
break;
|
||||
}
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
}
|
||||
/* Final drain — ensure all queued work finishes before we read results. */
|
||||
cuda->cuCtxSynchronize();
|
||||
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
@@ -1306,10 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
#endif
|
||||
|
||||
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||
printf("device=%s\n", report->device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||
printf("backend=%s\n", report->backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||
printf("streams=%d\n", report->stream_count);
|
||||
printf("iterations=%lu\n", report->iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||
if (report->details[0] != '\0') {
|
||||
printf("%s", report->details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||
const char *precision_plan = NULL;
|
||||
const char *precision_plan_seconds = NULL;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
@@ -1317,8 +1476,16 @@ int main(int argc, char **argv) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||
precision_filter = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||
precision_plan = argv[++i];
|
||||
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||
precision_plan_seconds = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
fprintf(stderr,
|
||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||
argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1378,26 +1545,94 @@ int main(int argc, char **argv) {
|
||||
int ok = 0;
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
||||
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||
char *plan_copy = strdup(precision_plan);
|
||||
char *plan_seconds_copy = NULL;
|
||||
int phase_seconds[32] = {0};
|
||||
int phase_seconds_count = 0;
|
||||
int phase_ok = 0;
|
||||
if (plan_copy == NULL) {
|
||||
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||
return 1;
|
||||
}
|
||||
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||
if (plan_seconds_copy == NULL) {
|
||||
free(plan_copy);
|
||||
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||
return 1;
|
||||
}
|
||||
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||
sec_token = strtok(NULL, ",")) {
|
||||
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||
sec_token++;
|
||||
}
|
||||
if (*sec_token == '\0') {
|
||||
continue;
|
||||
}
|
||||
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||
}
|
||||
}
|
||||
int phase_idx = 0;
|
||||
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||
while (*token == ' ' || *token == '\t') {
|
||||
token++;
|
||||
}
|
||||
if (*token == '\0') {
|
||||
continue;
|
||||
}
|
||||
const char *phase_name = token;
|
||||
const char *phase_filter = token;
|
||||
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||
phase_filter = NULL;
|
||||
}
|
||||
int phase_duration = seconds;
|
||||
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||
phase_duration = phase_seconds[phase_idx];
|
||||
}
|
||||
printf("phase_begin=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
memset(&report, 0, sizeof(report));
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||
if (ok) {
|
||||
print_stress_report(&report, device_index, phase_duration);
|
||||
phase_ok = 1;
|
||||
} else {
|
||||
printf("phase_error=%s\n", phase_name);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
printf("status=FAILED\n");
|
||||
}
|
||||
printf("phase_end=%s\n", phase_name);
|
||||
fflush(stdout);
|
||||
}
|
||||
free(plan_seconds_copy);
|
||||
free(plan_copy);
|
||||
return phase_ok ? 0 : 1;
|
||||
}
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||
#endif
|
||||
if (!ok) {
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
||||
if (precision_filter != NULL) {
|
||||
fprintf(stderr,
|
||||
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||
precision_filter,
|
||||
name,
|
||||
cc_major,
|
||||
cc_minor);
|
||||
return 1;
|
||||
}
|
||||
int ptx_mb = size_mb;
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||
printf("streams=%d\n", report.stream_count);
|
||||
printf("iterations=%lu\n", report.iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||
if (report.details[0] != '\0') {
|
||||
printf("%s", report.details);
|
||||
}
|
||||
printf("status=OK\n");
|
||||
print_stress_report(&report, device_index, seconds);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
@@ -156,6 +161,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
@@ -170,6 +176,7 @@ run_variant() {
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-e BEE_REQUIRE_MEMTEST=1 \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
@@ -180,6 +187,9 @@ case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
nvidia-legacy)
|
||||
run_variant nvidia-legacy
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
@@ -188,6 +198,7 @@ case "$VARIANT" in
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant nvidia-legacy
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
#!/bin/sh
|
||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
||||
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||
#
|
||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||
# - open -> kernel-open/ sources from the .run installer
|
||||
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||
#
|
||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||
@@ -17,10 +19,19 @@ set -e
|
||||
NVIDIA_VERSION="$1"
|
||||
DIST_DIR="$2"
|
||||
DEBIAN_KERNEL_ABI="$3"
|
||||
NVIDIA_FLAVOR="${4:-open}"
|
||||
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
|
||||
case "$NVIDIA_FLAVOR" in
|
||||
open|proprietary) ;;
|
||||
*)
|
||||
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
# On Debian, kernel headers are split into two packages:
|
||||
@@ -31,7 +42,22 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
CACHE_LAYOUT_VERSION="3"
|
||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||
echo "=== installing linux-headers-${KVER} ==="
|
||||
@@ -42,18 +68,6 @@ fi
|
||||
echo "kernel headers (arch): $KDIR_ARCH"
|
||||
echo "kernel headers (common): $KDIR_COMMON"
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Download official NVIDIA .run installer with sha256 verification
|
||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||
@@ -87,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
||||
rm -rf "$EXTRACT_DIR"
|
||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||
|
||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
||||
# Find kernel source directory for the selected flavor.
|
||||
KERNEL_SRC=""
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
else
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
fi
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
echo "kernel source: $KERNEL_SRC"
|
||||
|
||||
# Build kernel modules
|
||||
@@ -130,24 +150,30 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||
copied_libs=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||
cp "$f" "$CACHE_DIR/lib/"
|
||||
copied_libs=$((copied_libs+1))
|
||||
done
|
||||
|
||||
if [ "$copied_libs" -eq 0 ]; then
|
||||
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
done
|
||||
if [ "$count" -eq 0 ]; then
|
||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||
libnvidia-opencl; do
|
||||
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
@@ -156,23 +182,17 @@ done
|
||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
# Create soname symlinks for every copied versioned library.
|
||||
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||
[ -f "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||
echo "${lib}: .so.1 -> $base"
|
||||
stem=${base%%.so.*}
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||
done
|
||||
|
||||
touch "$CACHE_LAYOUT_MARKER"
|
||||
|
||||
echo "=== NVIDIA build complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $ko_count .ko files"
|
||||
|
||||
1032
iso/builder/build.sh
1032
iso/builder/build.sh
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user