Compare commits
362 Commits
ab5a4be7ac
...
v8.30
| Author | SHA1 | Date | |
|---|---|---|---|
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 | |||
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 | |||
| 0a98ed8ae9 | |||
| 911745e4da | |||
| acfd2010d7 | |||
| e904c13790 | |||
| 24c5c72cee | |||
| 6ff0bcad56 | |||
| 4fef26000c | |||
| a393dcb731 | |||
| 9e55728053 | |||
| 4b8023c1cb | |||
| 4c8417d20a | |||
| 0755374dd2 | |||
| c70ae274fa | |||
| 23ad7ff534 | |||
| de130966f7 | |||
| c6fbfc8306 | |||
| 35ad1c74d9 | |||
| 4a02e74b17 | |||
| cd2853ad99 | |||
| 6caf771d6e | |||
| 14fa87b7d7 | |||
| 600ece911b | |||
| 2d424c63cb | |||
| 50f28d1ee6 | |||
| 3579747ae3 | |||
| 09dc7d2613 | |||
| ec0b7f7ff9 | |||
| e7a7ff54b9 | |||
| b4371e291e | |||
| c22b53a406 | |||
| ff0acc3698 | |||
| d50760e7c6 | |||
| ed4f8be019 | |||
| 883592d029 | |||
| a6dcaf1c7e | |||
| 88727fb590 | |||
| c9f5224c42 | |||
| 7cb5c02a9b | |||
| c1aa3cf491 | |||
| f7eb75c57c | |||
| 004cc4910d | |||
| ed1cceed8c | |||
| 9fe9f061f8 | |||
| 837a1fb981 | |||
| 1f43b4e050 | |||
| 83bbc8a1bc | |||
| 896bdb6ee8 | |||
| 5407c26e25 | |||
| 4fddaba9c5 | |||
| d2f384b6eb | |||
| 25f0f30aaf | |||
| a57b037a91 | |||
| 5644231f9a | |||
| eea98e6d76 | |||
| 967455194c | |||
| 79dabf3efb | |||
| 1336f5b95c | |||
| 31486a31c1 | |||
| aa3fc332ba | |||
| 62c57b87f2 | |||
| f600261546 | |||
| d7ca04bdfb | |||
| 5433652c70 | |||
| b25f014dbd | |||
| d69a46f211 | |||
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f | ||
|
|
1e62f828c6 | ||
|
|
f8c997d272 | ||
|
|
0c16616cc9 | ||
|
|
adcc147b32 | ||
|
|
94e233651e | ||
|
|
03c36f6cb2 | ||
|
|
a221814797 | ||
|
|
b6619d5ccc | ||
|
|
450193b063 | ||
|
|
ee8931f171 | ||
|
|
b771d95894 | ||
|
|
8e60e474dc | ||
|
|
2f4ec2acda | ||
|
|
7ed5cb0306 | ||
|
|
6df7ac68f5 | ||
|
|
0ce23aea4f | ||
|
|
36dff6e584 | ||
|
|
1c80906c1f | ||
|
|
2abe2ce3aa | ||
|
|
8233c9ee85 | ||
|
|
13189e2683 | ||
|
|
76a17937f3 | ||
|
|
b965184e71 | ||
|
|
b25a2f6d30 | ||
|
|
d18cde19c1 | ||
|
|
78c6dfc0ef | ||
|
|
72cf482ad3 | ||
|
|
a6023372b1 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +1,6 @@
|
||||
[submodule "bible"]
|
||||
path = bible
|
||||
url = https://git.mchus.pro/mchus/bible.git
|
||||
[submodule "internal/chart"]
|
||||
path = internal/chart
|
||||
url = https://git.mchus.pro/reanimator/chart.git
|
||||
|
||||
17
PLAN.md
17
PLAN.md
@@ -272,13 +272,10 @@ ISO image bootable via BMC virtual media or USB. Runs boot services automaticall
|
||||
|
||||
### 2.1 — Builder environment
|
||||
|
||||
`iso/builder/setup-builder.sh` prepares a Debian 12 host/VM with:
|
||||
- `live-build`, `debootstrap`, bootloader tooling, kernel headers
|
||||
- Go toolchain
|
||||
- everything needed to compile the `bee` binary and NVIDIA modules
|
||||
|
||||
`iso/builder/build-in-container.sh` offers the same builder stack in a Debian 12 container image.
|
||||
The container run is privileged because `live-build` needs mount/chroot/loop capabilities.
|
||||
`iso/builder/build-in-container.sh` is the only supported builder entrypoint.
|
||||
It builds a Debian 12 builder image with `live-build`, toolchains, and pinned kernel headers,
|
||||
then runs the ISO assembly in a privileged container because `live-build` needs
|
||||
mount/chroot/loop capabilities.
|
||||
|
||||
`iso/builder/build.sh` orchestrates the full ISO build:
|
||||
1. compile the Go `bee` binary
|
||||
@@ -346,9 +343,9 @@ Planned code shape:
|
||||
- `bee tui` can rerun the audit manually
|
||||
- `bee tui` can export the latest audit JSON to removable media
|
||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||
|
||||
### 2.6 — Vendor utilities and optional assets
|
||||
@@ -392,7 +389,7 @@ No "works on my Mac" drift.
|
||||
|
||||
--- BUILDER + BEE ISO (unblock real-hardware testing) ---
|
||||
|
||||
2.1 builder setup → Debian host/VM or privileged container with build deps
|
||||
2.1 builder setup → privileged container with build deps
|
||||
2.2 debug ISO profile → minimal Debian ISO: `bee` binary + OpenSSH + all packages
|
||||
2.3 boot on real server → SSH in, verify packages present, run audit manually
|
||||
|
||||
|
||||
22
audit/Makefile
Normal file
22
audit/Makefile
Normal file
@@ -0,0 +1,22 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||
endif
|
||||
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
mkdir -p $(EXPORT_DIR)
|
||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
@@ -1,29 +1,49 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"bee/audit/internal/tui"
|
||||
"bee/audit/internal/webui"
|
||||
)
|
||||
|
||||
var Version = "dev"
|
||||
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
return "dev"
|
||||
}
|
||||
return label
|
||||
}
|
||||
|
||||
func main() {
|
||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||
}
|
||||
|
||||
func run(args []string, stdout, stderr io.Writer) int {
|
||||
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
slog.Error("fatal panic",
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
exitCode = 1
|
||||
}
|
||||
}()
|
||||
|
||||
if len(args) == 0 {
|
||||
printRootUsage(stderr)
|
||||
@@ -39,12 +59,18 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
case "audit":
|
||||
return runAudit(args[1:], stdout, stderr)
|
||||
case "tui":
|
||||
return runTUI(args[1:], stdout, stderr)
|
||||
case "export":
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
return runPreflight(args[1:], stdout, stderr)
|
||||
case "support-bundle":
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -58,9 +84,12 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee tui --runtime auto|local|livecd
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee export --target <device>
|
||||
bee sat nvidia|memory|storage
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -69,12 +98,18 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
switch args[0] {
|
||||
case "audit":
|
||||
return runAudit([]string{"--help"}, stdout, stdout)
|
||||
case "tui":
|
||||
return runTUI([]string{"--help"}, stdout, stdout)
|
||||
case "export":
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||
case "support-bundle":
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -129,43 +164,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runTUI(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("tui", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee tui [--runtime auto|local|livecd]")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
||||
if err != nil {
|
||||
slog.Error("resolve runtime", "err", err)
|
||||
return 1
|
||||
}
|
||||
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
|
||||
application := app.New(platform.New())
|
||||
if err := tui.Run(application, runtimeInfo.Mode); err != nil {
|
||||
slog.Error("run tui", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -213,45 +211,273 @@ func runExport(args []string, stdout, stderr io.Writer) int {
|
||||
return 1
|
||||
}
|
||||
|
||||
func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("preflight", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
application := app.New(platform.New())
|
||||
path, err := application.RunRuntimePreflight(*output)
|
||||
if err != nil {
|
||||
slog.Error("run preflight", "err", err)
|
||||
return 1
|
||||
}
|
||||
if path != "stdout" {
|
||||
slog.Info("runtime health written", "path", path)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
path, err := app.BuildSupportBundle(app.DefaultExportDir)
|
||||
if err != nil {
|
||||
slog.Error("build support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
defer os.Remove(path)
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
slog.Error("read support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
switch {
|
||||
case *output == "stdout":
|
||||
if _, err := stdout.Write(raw); err != nil {
|
||||
slog.Error("write support bundle stdout", "err", err)
|
||||
return 1
|
||||
}
|
||||
case strings.HasPrefix(*output, "file:"):
|
||||
dst := strings.TrimPrefix(*output, "file:")
|
||||
if err := os.WriteFile(dst, raw, 0644); err != nil {
|
||||
slog.Error("write support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("support bundle written", "path", dst)
|
||||
default:
|
||||
fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for web", "err", err)
|
||||
}
|
||||
|
||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||
Title: *title,
|
||||
BuildLabel: buildLabel(),
|
||||
AuditPath: *auditPath,
|
||||
ExportDir: *exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}); err != nil {
|
||||
slog.Error("run web", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
return 2
|
||||
}
|
||||
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage")
|
||||
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
return 0
|
||||
}
|
||||
if args[0] != "nvidia" && args[0] != "memory" && args[0] != "storage" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0])
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
|
||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if len(args) > 1 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
if fs.NArg() != 0 {
|
||||
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
|
||||
return 2
|
||||
}
|
||||
|
||||
target := args[0]
|
||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
label string
|
||||
)
|
||||
switch args[0] {
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
switch target {
|
||||
case "nvidia":
|
||||
label = "nvidia"
|
||||
archive, err = application.RunNvidiaAcceptancePack("")
|
||||
level := *diagLevel
|
||||
if level > 0 {
|
||||
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||
} else {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
label = "memory"
|
||||
archive, err = application.RunMemoryAcceptancePack("")
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||
case "storage":
|
||||
label = "storage"
|
||||
archive, err = application.RunStorageAcceptancePack("")
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||
}
|
||||
if err != nil {
|
||||
slog.Error("run sat", "target", label, "err", err)
|
||||
slog.Error("run sat", "target", target, "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("sat archive written", "target", label, "path", archive)
|
||||
slog.Info("sat archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 0
|
||||
}
|
||||
target := args[0]
|
||||
if target != "nvidia" {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
|
||||
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||
return 2
|
||||
}
|
||||
|
||||
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||
Profile: *profile,
|
||||
SizeMB: *sizeMB,
|
||||
GPUIndices: includeIndices,
|
||||
ExcludeGPUIndices: excludeIndices,
|
||||
RunNCCL: !*skipNCCL,
|
||||
}, logLine)
|
||||
if err != nil {
|
||||
slog.Error("run benchmark", "target", target, "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return nil, nil
|
||||
}
|
||||
var indices []int
|
||||
for _, part := range strings.Split(raw, ",") {
|
||||
part = strings.TrimSpace(part)
|
||||
if part == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(part)
|
||||
if err != nil || value < 0 {
|
||||
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||
}
|
||||
indices = append(indices, value)
|
||||
}
|
||||
return indices, nil
|
||||
}
|
||||
|
||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
old := Version
|
||||
Version = "test-version"
|
||||
t.Cleanup(func() { Version = old })
|
||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||
old := Version
|
||||
Version = "1.2.3"
|
||||
t.Cleanup(func() { Version = old })
|
||||
|
||||
if got := buildLabel(); got != "1.2.3" {
|
||||
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunExportRequiresTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -91,6 +99,32 @@ func TestRunSATUsage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunPreflightRejectsExtraArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
rc := run([]string{"preflight", "extra"}, &stdout, &stderr)
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee preflight") {
|
||||
t.Fatalf("stderr missing preflight usage:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunSupportBundleRejectsExtraArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
rc := run([]string{"support-bundle", "extra"}, &stdout, &stderr)
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee support-bundle") {
|
||||
t.Fatalf("stderr missing support-bundle usage:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunHelpForSubcommand(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -138,7 +172,7 @@ func TestRunSATHelp(t *testing.T) {
|
||||
if rc != 0 {
|
||||
t.Fatalf("rc=%d want 0", rc)
|
||||
}
|
||||
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage") {
|
||||
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage|cpu") {
|
||||
t.Fatalf("stdout missing sat help:\n%s", stdout.String())
|
||||
}
|
||||
}
|
||||
@@ -151,8 +185,8 @@ func TestRunSATRejectsExtraArgs(t *testing.T) {
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") {
|
||||
t.Fatalf("stderr missing sat usage:\n%s", stderr.String())
|
||||
if !strings.Contains(stderr.String(), "bee sat: unexpected arguments") {
|
||||
t.Fatalf("stderr missing sat error:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
34
audit/go.mod
34
audit/go.mod
@@ -1,24 +1,22 @@
|
||||
module bee/audit
|
||||
|
||||
go 1.23
|
||||
go 1.25.0
|
||||
|
||||
require github.com/charmbracelet/bubbletea v1.3.4
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require (
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/charmbracelet/lipgloss v1.0.0 // indirect
|
||||
github.com/charmbracelet/x/ansi v0.8.0 // indirect
|
||||
github.com/charmbracelet/x/term v0.2.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.16 // indirect
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.15.2 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
golang.org/x/sync v0.11.0 // indirect
|
||||
golang.org/x/sys v0.30.0 // indirect
|
||||
golang.org/x/text v0.3.8 // indirect
|
||||
modernc.org/sqlite v1.48.0
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.72.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
)
|
||||
|
||||
82
audit/go.sum
82
audit/go.sum
@@ -1,37 +1,51 @@
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
||||
github.com/charmbracelet/bubbletea v1.3.4 h1:kCg7B+jSCFPLYRA52SDZjr51kG/fMUEoPoZrkaDHyoI=
|
||||
github.com/charmbracelet/bubbletea v1.3.4/go.mod h1:dtcUCyCGEX3g9tosuYiut3MXgY/Jsv9nKVdibKKRRXo=
|
||||
github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
|
||||
github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
|
||||
github.com/charmbracelet/x/ansi v0.8.0 h1:9GTq3xq9caJW8ZrBTe0LIe2fvfLR/bYXKTx2llXn7xE=
|
||||
github.com/charmbracelet/x/ansi v0.8.0/go.mod h1:wdYl/ONOLHLIVmQaxbIYEC/cRKOQyjTkowiI4blgS9Q=
|
||||
github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
|
||||
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
|
||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
|
||||
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
||||
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
|
||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
@@ -17,17 +19,32 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
DefaultAuditJSONPath = "/var/log/bee-audit.json"
|
||||
DefaultAuditLogPath = "/var/log/bee-audit.log"
|
||||
DefaultSATBaseDir = "/var/log/bee-sat"
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -41,10 +58,15 @@ type networkManager interface {
|
||||
DHCPOne(iface string) (string, error)
|
||||
DHCPAll() (string, error)
|
||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||
SetInterfaceState(iface string, up bool) error
|
||||
GetInterfaceState(iface string) (bool, error)
|
||||
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||
}
|
||||
|
||||
type serviceManager interface {
|
||||
ListBeeServices() ([]string, error)
|
||||
ServiceState(name string) string
|
||||
ServiceStatus(name string) (string, error)
|
||||
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
||||
}
|
||||
@@ -59,24 +81,127 @@ type toolManager interface {
|
||||
CheckTools(names []string) []platform.ToolStatus
|
||||
}
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
type GPUPresenceResult struct {
|
||||
Nvidia bool
|
||||
AMD bool
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||
vendor := a.sat.DetectGPUVendor()
|
||||
return GPUPresenceResult{
|
||||
Nvidia: vendor == "nvidia",
|
||||
AMD: vendor == "amd",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||
return a.installer.LiveMediaRAMState()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error)
|
||||
CaptureTechnicalDump(baseDir string) error
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
slog.Warn("capture technical dump", "err", err)
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
data, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -88,7 +213,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
@@ -97,6 +222,69 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) RunRuntimePreflight(output string) (string, error) {
|
||||
health, err := a.runtime.CollectRuntimeHealth(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
data, err := json.MarshalIndent(health, "", " ")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
switch {
|
||||
case output == "stdout":
|
||||
_, err := os.Stdout.Write(append(data, '\n'))
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
default:
|
||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) RunRuntimePreflightResult() (ActionResult, error) {
|
||||
path, err := a.RunRuntimePreflight("file:" + DefaultRuntimeJSONPath)
|
||||
body := "Runtime preflight completed."
|
||||
if path != "" {
|
||||
body = "Runtime health written to " + path
|
||||
}
|
||||
return ActionResult{Title: "Run self-check", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RuntimeHealthResult() ActionResult {
|
||||
health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath)
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||
}
|
||||
driverLabel := "Driver ready"
|
||||
accelLabel := "CUDA ready"
|
||||
switch a.sat.DetectGPUVendor() {
|
||||
case "amd":
|
||||
driverLabel = "AMDGPU ready"
|
||||
accelLabel = "ROCm SMI ready"
|
||||
case "nvidia":
|
||||
driverLabel = "NVIDIA ready"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||
if len(health.Issues) > 0 {
|
||||
body.WriteString("\n\nIssues:\n")
|
||||
for _, issue := range health.Issues {
|
||||
fmt.Fprintf(&body, "- %s: %s\n", issue.Code, issue.Description)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "Runtime issues", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) RunAuditNow(runtimeMode runtimeenv.Mode) (ActionResult, error) {
|
||||
path, err := a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath)
|
||||
body := "Audit completed."
|
||||
@@ -116,10 +304,13 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -129,13 +320,37 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit exported."
|
||||
if path != "" {
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
@@ -166,6 +381,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
@@ -222,6 +453,10 @@ func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
@@ -277,12 +512,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
@@ -290,30 +528,304 @@ func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "Memory SAT", Body: body}, err
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunStorageAcceptancePack(baseDir)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return ActionResult{Title: "Storage SAT", Body: body}, err
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
|
||||
func (a *App) HealthSummaryResult() ActionResult {
|
||||
@@ -325,6 +837,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -359,6 +872,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
@@ -435,14 +949,68 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||
// component-status DB so they are visible in the Hardware Summary card.
|
||||
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||
if db == nil || len(psus) == 0 {
|
||||
return
|
||||
}
|
||||
const source = "audit:ipmi"
|
||||
worstStatus := "OK"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
slot := "?"
|
||||
if psu.Slot != nil {
|
||||
slot = *psu.Slot
|
||||
}
|
||||
st := *psu.Status
|
||||
detail := ""
|
||||
if psu.ErrorDescription != nil {
|
||||
detail = *psu.ErrorDescription
|
||||
}
|
||||
db.Record("psu:"+slot, source, st, detail)
|
||||
switch st {
|
||||
case "Critical":
|
||||
worstStatus = "Critical"
|
||||
case "Warning":
|
||||
if worstStatus != "Critical" {
|
||||
worstStatus = "Warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
db.Record("psu:all", source, worstStatus, "")
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return schema.RuntimeHealth{}, err
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
return schema.RuntimeHealth{}, err
|
||||
}
|
||||
return health, nil
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
@@ -647,12 +1215,17 @@ func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
model := strings.ToLower(trimPtr(dev.Model))
|
||||
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
||||
// Exclude ASPEED (BMC VGA adapter, not a compute GPU)
|
||||
if strings.Contains(vendor, "aspeed") || strings.Contains(model, "aspeed") {
|
||||
return false
|
||||
}
|
||||
// AMD Instinct / Radeon compute GPUs have class ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match by AMD vendor alone — chipset/CPU PCIe devices share that vendor.
|
||||
return class == "VideoController" ||
|
||||
class == "DisplayController" ||
|
||||
class == "ProcessingAccelerator" ||
|
||||
strings.Contains(model, "nvidia") ||
|
||||
strings.Contains(vendor, "nvidia") ||
|
||||
strings.Contains(vendor, "amd")
|
||||
strings.Contains(vendor, "nvidia")
|
||||
}
|
||||
|
||||
func trimPtr(value *string) string {
|
||||
@@ -725,3 +1298,70 @@ func firstNonEmpty(values ...string) string {
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func cleanSummaryKey(key string) string {
|
||||
idx := strings.Index(key, "-")
|
||||
if idx <= 0 {
|
||||
return key
|
||||
}
|
||||
prefix := key[:idx]
|
||||
for _, c := range prefix {
|
||||
if c < '0' || c > '9' {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return key[idx+1:]
|
||||
}
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
@@ -39,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
||||
return f.setStaticIPv4Fn(cfg)
|
||||
}
|
||||
|
||||
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return platform.NetworkSnapshot{}, nil
|
||||
}
|
||||
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||
|
||||
type fakeServices struct {
|
||||
serviceStatusFn func(string) (string, error)
|
||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||
@@ -48,6 +59,10 @@ func (f fakeServices) ListBeeServices() ([]string, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceState(name string) string {
|
||||
return "active"
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
||||
return f.serviceStatusFn(name)
|
||||
}
|
||||
@@ -56,16 +71,41 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
||||
return f.serviceDoFn(name, action)
|
||||
}
|
||||
|
||||
type fakeExports struct{}
|
||||
type fakeExports struct {
|
||||
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
if f.listTargetsFn != nil {
|
||||
return f.listTargetsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||
if f.exportToTargetFn != nil {
|
||||
return f.exportToTargetFn(src, target)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
type fakeRuntime struct {
|
||||
collectFn func(string) (schema.RuntimeHealth, error)
|
||||
dumpFn func(string) error
|
||||
}
|
||||
|
||||
func (f fakeRuntime) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return f.collectFn(exportDir)
|
||||
}
|
||||
|
||||
func (f fakeRuntime) CaptureTechnicalDump(baseDir string) error {
|
||||
if f.dumpFn != nil {
|
||||
return f.dumpFn(baseDir)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeTools struct {
|
||||
tailFileFn func(string, int) string
|
||||
checkToolsFn func([]string) []platform.ToolStatus
|
||||
@@ -80,23 +120,211 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNCCLFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||
resetNvidiaGPUFn func(int) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBenchmarkFn != nil {
|
||||
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerFn != nil {
|
||||
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPulseFn != nil {
|
||||
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBandwidthFn != nil {
|
||||
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
if f.listNvidiaGPUStatusesFn != nil {
|
||||
return f.listNvidiaGPUStatusesFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
if f.resetNvidiaGPUFn != nil {
|
||||
return f.resetNvidiaGPUFn(index)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||
if f.runCPUFn != nil {
|
||||
return f.runCPUFn(baseDir, durationSec)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) DetectGPUVendor() string {
|
||||
if f.detectVendorFn != nil {
|
||||
return f.detectVendorFn()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
if f.listAMDGPUsFn != nil {
|
||||
return f.listAMDGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNCCLFn != nil {
|
||||
return f.runNCCLFn(baseDir, gpuIndices)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotBaseDir string
|
||||
var gotGPUIndices []int
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||
gotBaseDir = baseDir
|
||||
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||
return "/tmp/nccl-tests.tar.gz", nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("RunNCCLTests error: %v", err)
|
||||
}
|
||||
if path != "/tmp/nccl-tests.tar.gz" {
|
||||
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||
}
|
||||
if gotBaseDir != "/tmp/sat" {
|
||||
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||
}
|
||||
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -110,6 +338,9 @@ func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
},
|
||||
defaultRouteFn: func() string { return "10.0.0.1" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -138,6 +369,9 @@ func TestNetworkStatusHandlesNoInterfaces(t *testing.T) {
|
||||
listInterfacesFn: func() ([]platform.InterfaceInfo, error) { return nil, nil },
|
||||
defaultRouteFn: func() string { return "" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -159,6 +393,9 @@ func TestNetworkStatusPropagatesListError(t *testing.T) {
|
||||
},
|
||||
defaultRouteFn: func() string { return "" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -183,6 +420,9 @@ func TestParseStaticIPv4ConfigAndDefaults(t *testing.T) {
|
||||
dhcpAllFn: func() (string, error) { return "", nil },
|
||||
setStaticIPv4Fn: func(platform.StaticIPv4Config) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
defaults := a.DefaultStaticIPv4FormFields("eth0")
|
||||
@@ -219,6 +459,9 @@ func TestServiceActionResults(t *testing.T) {
|
||||
return string(action) + " ok", nil
|
||||
},
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
statusResult, err := a.ServiceStatusResult("bee-audit")
|
||||
@@ -301,6 +544,11 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
runMemoryFn: func(string) (string, error) { return "", nil },
|
||||
runStorageFn: func(string) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{Status: "PARTIAL", ExportDir: "/tmp/export"}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if got, _ := a.DHCPOneResult("eth0"); got.Body != "DHCP completed." {
|
||||
@@ -327,14 +575,83 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
if got, _ := a.RunNvidiaAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
t.Fatalf("sat body=%q", got.Body)
|
||||
}
|
||||
if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "No output produced." {
|
||||
t.Fatalf("memory sat body=%q", got.Body)
|
||||
}
|
||||
if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "No output produced." {
|
||||
t.Fatalf("storage sat body=%q", got.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||
if filepath.Base(src) == "" {
|
||||
t.Fatalf("expected non-empty source path")
|
||||
}
|
||||
return "/media/bee/" + filepath.Base(src), nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||
}
|
||||
if result.Title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||
}
|
||||
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||
if err == nil {
|
||||
t.Fatal("expected export error")
|
||||
}
|
||||
if contains(result.Body, "exported to") {
|
||||
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||
}
|
||||
if result.Body != "Support bundle export failed." {
|
||||
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -349,6 +666,9 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
runMemoryFn: func(string) (string, error) { return "", nil },
|
||||
runStorageFn: func(string) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.RunNvidiaAcceptancePackResult("/tmp/sat")
|
||||
@@ -360,6 +680,48 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNvidiaFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("nvidia baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
runMemoryFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("memory baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
runStorageFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("storage baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatSATSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -398,6 +760,143 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
archive, err := BuildSupportBundle(exportDir)
|
||||
if err != nil {
|
||||
t.Fatalf("BuildSupportBundle error: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(archive); err != nil {
|
||||
t.Fatalf("archive stat: %v", err)
|
||||
}
|
||||
|
||||
file, err := os.Open(archive)
|
||||
if err != nil {
|
||||
t.Fatalf("open archive: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gzr, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
t.Fatalf("gzip reader: %v", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/kernel-aer-nvidia.txt",
|
||||
"/system/lspci-nvidia-bridges-vv.txt",
|
||||
"/system/pcie-aer-sysfs.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
"/system/mstflint-query.txt",
|
||||
} {
|
||||
var found bool
|
||||
for _, name := range names {
|
||||
if contains(name, want) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
foundRaw = true
|
||||
}
|
||||
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||
}
|
||||
}
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
@@ -409,6 +908,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -424,12 +927,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
@@ -472,6 +975,44 @@ func TestMainBanner(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldRuntimePath := DefaultRuntimeJSONPath
|
||||
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
|
||||
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
|
||||
|
||||
raw, err := json.Marshal(schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: "/appdata/bee/export",
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
NetworkStatus: "OK",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal runtime health: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
|
||||
t.Fatalf("write runtime health: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
detectVendorFn: func() string { return "amd" },
|
||||
},
|
||||
}
|
||||
|
||||
result := a.RuntimeHealthResult()
|
||||
if !contains(result.Body, "AMDGPU ready: true") {
|
||||
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
|
||||
}
|
||||
if !contains(result.Body, "ROCm SMI ready: true") {
|
||||
t.Fatalf("body missing ROCm label:\n%s", result.Body)
|
||||
}
|
||||
if contains(result.Body, "CUDA ready") {
|
||||
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func intPtr(v int) *int { return &v }
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
|
||||
67
audit/internal/app/atomic_write.go
Normal file
67
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// readFileLimited reads path into memory, refusing files larger than maxBytes.
|
||||
// Prevents OOM on corrupted or unexpectedly large data files.
|
||||
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if int64(len(data)) > maxBytes {
|
||||
return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
}
|
||||
|
||||
tmpPath := path + ".tmp"
|
||||
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||
}
|
||||
|
||||
success := false
|
||||
defer func() {
|
||||
_ = f.Close()
|
||||
if !success {
|
||||
_ = os.Remove(tmpPath)
|
||||
}
|
||||
}()
|
||||
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||
}
|
||||
|
||||
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||
_ = dir.Sync()
|
||||
_ = dir.Close()
|
||||
}
|
||||
|
||||
success = true
|
||||
return nil
|
||||
}
|
||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||
t.Fatalf("seed file: %v", err)
|
||||
}
|
||||
|
||||
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||
t.Fatalf("atomicWriteFile: %v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read final: %v", err)
|
||||
}
|
||||
if string(raw) != "new\n" {
|
||||
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||
a := &App{
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: exportDir,
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got, err := a.RunRuntimePreflight("file:" + path)
|
||||
if err != nil {
|
||||
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||
}
|
||||
if got != path {
|
||||
t.Fatalf("path=%q want %q", got, path)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read runtime file: %v", err)
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
t.Fatalf("json unmarshal: %v", err)
|
||||
}
|
||||
if health.Status != "OK" {
|
||||
t.Fatalf("status=%q want OK", health.Status)
|
||||
}
|
||||
}
|
||||
268
audit/internal/app/component_status_db.go
Normal file
268
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
|
||||
type ComponentStatusEntry struct {
|
||||
At time.Time `json:"at"`
|
||||
Status string `json:"status"`
|
||||
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := readFileLimited(path, 10<<20)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity returns a numeric severity so higher values win.
|
||||
func componentSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func satStatusToDBStatus(overall string) string {
|
||||
switch overall {
|
||||
case "OK":
|
||||
return "OK"
|
||||
case "FAILED":
|
||||
return "Warning"
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return "Unknown"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
func extractArchivePath(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if strings.HasSuffix(s, ".tar.gz") {
|
||||
parts := strings.Fields(s)
|
||||
if len(parts) > 0 {
|
||||
return parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func parseSATKV(raw string) map[string]string {
|
||||
kv := make(map[string]string)
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||
if ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return kv
|
||||
}
|
||||
421
audit/internal/app/sat_overlay.go
Normal file
421
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,421 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
|
||||
applyCPUSAT(snap.CPUs, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type nvidiaPerGPUStatus struct {
|
||||
runStatus string
|
||||
reason string
|
||||
}
|
||||
|
||||
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if devs[i].Telemetry == nil {
|
||||
continue
|
||||
}
|
||||
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
idx, ok := telemetryInt(rawIdx)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
st, ok := statusByIndex[idx]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
runDir := matches[len(matches)-1]
|
||||
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return nil, "", false
|
||||
}
|
||||
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||
for _, file := range files {
|
||||
raw, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[idx] = nvidiaPerGPUStatus{
|
||||
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||
reason: strings.TrimSpace(kv["reason"]),
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
return out, runAtUTC, true
|
||||
}
|
||||
|
||||
func telemetryInt(v any) (int, bool) {
|
||||
switch value := v.(type) {
|
||||
case int:
|
||||
return value, true
|
||||
case int32:
|
||||
return int(value), true
|
||||
case int64:
|
||||
return int(value), true
|
||||
case float64:
|
||||
return int(value), true
|
||||
case string:
|
||||
n, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
kv map[string]string
|
||||
}
|
||||
|
||||
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return satSummary{}, false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return satSummary{}, false
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
return satSummary{
|
||||
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||
kv: kv,
|
||||
}, true
|
||||
}
|
||||
|
||||
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if !matchesGPUVendor(devs[i], vendor) {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range dimms {
|
||||
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range cpus {
|
||||
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
||||
byDevice := parseStorageSATStatus(summary)
|
||||
for i := range disks {
|
||||
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
||||
devName := filepath.Base(strings.TrimSpace(devPath))
|
||||
if devName == "" {
|
||||
continue
|
||||
}
|
||||
result, ok := byDevice[devName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
||||
}
|
||||
}
|
||||
|
||||
type satStatusResult struct {
|
||||
status string
|
||||
description string
|
||||
ok bool
|
||||
}
|
||||
|
||||
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
||||
result := map[string]satStatusResult{}
|
||||
for key, value := range summary.kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
||||
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
current := result[devName]
|
||||
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
||||
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
||||
return satKeyStatus(summary.overall, label)
|
||||
}
|
||||
|
||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||
case "OK":
|
||||
// No error description on success — error_description is for problems only.
|
||||
return "OK", "", true
|
||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||
// Tool couldn't run or test was incomplete — we can't assert hardware health.
|
||||
return "Unknown", "", true
|
||||
case "FAILED":
|
||||
return "Critical", label + " failed", true
|
||||
default:
|
||||
return "", "", false
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
}
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
newSeverity := statusSeverity(satStatus)
|
||||
currentSeverity := statusSeverity(current)
|
||||
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||
return
|
||||
}
|
||||
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
case "Unknown":
|
||||
return 1 // same as OK — does not override OK from another source
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||
func dbStatusToSATStatus(s string) string {
|
||||
switch strings.TrimSpace(s) {
|
||||
case "OK", "Warning", "Critical", "Unknown":
|
||||
return s
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||
func sanitizeBDFForLookup(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||
return ""
|
||||
}
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
bdf = "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
}
|
||||
return *v
|
||||
}
|
||||
|
||||
func appStringPtr(value string) *string {
|
||||
return &value
|
||||
}
|
||||
109
audit/internal/app/sat_overlay_test.go
Normal file
109
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "storage-20260325-161151")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
}
|
||||
if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
|
||||
t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "VideoController"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
bdf0 := "0000:4b:00.0"
|
||||
bdf1 := "0000:4f:00.0"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf0,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||
},
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf1,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||
}
|
||||
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||
got := "<nil>"
|
||||
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||
got = *snap.PCIeDevices[1].ErrorDescription
|
||||
}
|
||||
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||
}
|
||||
}
|
||||
699
audit/internal/app/support_bundle.go
Normal file
699
audit/internal/app/support_bundle.go
Normal file
@@ -0,0 +1,699 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
"bee-nvidia.service",
|
||||
"bee-preflight.service",
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
name string
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
nvidia-smi topo -m 2>&1 || true
|
||||
else
|
||||
echo "nvidia-smi not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||
`}},
|
||||
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||
for candidate in \
|
||||
/usr/bin/nvidia-fabricmanager \
|
||||
/usr/bin/nv-fabricmanager \
|
||||
/usr/bin/nvidia-fabricmanagerd \
|
||||
/usr/bin/nvlsm; do
|
||||
if [ -e "$candidate" ]; then
|
||||
echo "=== $candidate ==="
|
||||
ls -l "$candidate" 2>&1 || true
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||
echo "no fabric manager binaries found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||
echo
|
||||
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||
lspci -s "$bridge" -vv 2>&1 || true
|
||||
fi
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no NVIDIA PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||
found=0
|
||||
for dev in /sys/bus/pci/devices/*; do
|
||||
[ -e "$dev" ] || continue
|
||||
bdf=$(basename "$dev")
|
||||
block=""
|
||||
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||
if [ -r "$dev/$f" ]; then
|
||||
if [ -z "$block" ]; then
|
||||
block=1
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
fi
|
||||
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||
fi
|
||||
done
|
||||
if [ -n "$block" ]; then
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no PCIe AER sysfs counters found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -i "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -m "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v mstflint >/dev/null 2>&1; then
|
||||
echo "mstflint not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/bus/pci/devices/*; do
|
||||
[ -e "$path/vendor" ] || continue
|
||||
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x15b3" ] || continue
|
||||
bdf=$(basename "$path")
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
mstflint -d "$bdf" q 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no Mellanox/NVIDIA networking devices found"
|
||||
fi
|
||||
`}},
|
||||
}
|
||||
|
||||
var supportBundleOptionalFiles = []struct {
|
||||
name string
|
||||
src string
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
exportDir = DefaultExportDir
|
||||
}
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := cleanupOldSupportBundles(os.TempDir()); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.RemoveAll(stageRoot)
|
||||
|
||||
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, svc := range supportBundleServices {
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".status.txt"), []string{"systemctl", "status", svc, "--no-pager"}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".journal.log"), []string{"journalctl", "--no-pager", "-u", svc}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleCommands {
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, item.name), item.cmd); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
entries := supportBundleEntries(matches)
|
||||
for path, mod := range entries {
|
||||
if time.Since(mod) > 24*time.Hour {
|
||||
_ = os.Remove(path)
|
||||
delete(entries, path)
|
||||
}
|
||||
}
|
||||
ordered := orderSupportBundles(entries)
|
||||
if len(ordered) > 3 {
|
||||
for _, old := range ordered[3:] {
|
||||
_ = os.Remove(old)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func latestSupportBundlePath(dir string) (string, error) {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||
if len(ordered) == 0 {
|
||||
return "", os.ErrNotExist
|
||||
}
|
||||
return ordered[0], nil
|
||||
}
|
||||
|
||||
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||
entries := make(map[string]time.Time, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
entries[match] = info.ModTime()
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||
ordered := make([]string, 0, len(entries))
|
||||
for path := range entries {
|
||||
ordered = append(ordered, path)
|
||||
}
|
||||
sort.Slice(ordered, func(i, j int) bool {
|
||||
return entries[ordered[i]].After(entries[ordered[j]])
|
||||
})
|
||||
return ordered
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
args := []string{"--no-pager"}
|
||||
for _, svc := range supportBundleServices {
|
||||
args = append(args, "-u", svc)
|
||||
}
|
||||
raw, err := exec.Command("journalctl", args...).CombinedOutput()
|
||||
if len(raw) == 0 && err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
}
|
||||
if len(raw) == 0 {
|
||||
raw = []byte("no journal output\n")
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeCommandOutput(dst string, cmd []string) error {
|
||||
if len(cmd) == 0 {
|
||||
return nil
|
||||
}
|
||||
raw, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if len(raw) == 0 {
|
||||
if err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
} else {
|
||||
raw = []byte("no output\n")
|
||||
}
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "bee_version=%s\n", buildVersion())
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
if err := filepath.Walk(stageRoot, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil || info.IsDir() {
|
||||
return err
|
||||
}
|
||||
if filepath.Clean(path) == filepath.Clean(dst) {
|
||||
return nil
|
||||
}
|
||||
rel, err := filepath.Rel(stageRoot, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, fmt.Sprintf("%s\t%d", rel, info.Size()))
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
sort.Strings(files)
|
||||
for _, line := range files {
|
||||
body.WriteString(line)
|
||||
body.WriteByte('\n')
|
||||
}
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func bundleVersion() string {
|
||||
v := buildVersion()
|
||||
v = strings.TrimPrefix(v, "v")
|
||||
v = strings.TrimPrefix(v, "V")
|
||||
if v == "" || v == "unknown" {
|
||||
return "0.0"
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func serverModelForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Product Name" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return strings.ReplaceAll(val, " ", "_")
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func serverSerialForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Serial Number" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return val
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
return strings.TrimSpace(string(raw))
|
||||
}
|
||||
|
||||
func copyDirContents(srcDir, dstDir string) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPath(src, dst); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||
}
|
||||
|
||||
func normalizeSupportBundleAuditJSON(path string) error {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
normalized, err := ApplySATOverlay(data)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return os.WriteFile(path, normalized, 0644)
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyPath(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPath(filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name())); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
|
||||
out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, info.Mode().Perm())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(rootSrc, src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if keep != nil && !keep(rel, info) {
|
||||
return nil
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return copyPath(src, dst)
|
||||
}
|
||||
|
||||
func createSupportTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gz := gzip.NewWriter(file)
|
||||
defer gz.Close()
|
||||
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
base := filepath.Dir(srcDir)
|
||||
return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
header, err := tar.FileInfoHeader(info, "")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
header.Name, err = filepath.Rel(base, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := tw.WriteHeader(header); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
_, err = io.Copy(tw, f)
|
||||
return err
|
||||
})
|
||||
}
|
||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
amdSMIExecCommand = exec.Command
|
||||
amdSMILookPath = exec.LookPath
|
||||
amdSMIGlob = filepath.Glob
|
||||
)
|
||||
|
||||
var amdSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
"/usr/local/bin/rocm-smi",
|
||||
}
|
||||
|
||||
type amdGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
Product string
|
||||
Firmware string
|
||||
PowerW *float64
|
||||
TempC *float64
|
||||
}
|
||||
|
||||
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasAMDGPUDevices(devs) {
|
||||
return devs
|
||||
}
|
||||
infoByBDF, err := queryAMDGPUs()
|
||||
if err != nil {
|
||||
slog.Info("amdgpu: enrichment skipped", "err", err)
|
||||
return devs
|
||||
}
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(info.Serial) != "" {
|
||||
devs[i].SerialNumber = &info.Serial
|
||||
}
|
||||
if strings.TrimSpace(info.Firmware) != "" {
|
||||
devs[i].Firmware = &info.Firmware
|
||||
}
|
||||
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
||||
devs[i].Model = &info.Product
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
devs[i].PowerW = info.PowerW
|
||||
}
|
||||
if info.TempC != nil {
|
||||
devs[i].TemperatureC = info.TempC
|
||||
}
|
||||
enriched++
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("amdgpu: enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
for _, dev := range devs {
|
||||
if isAMDGPUDevice(dev) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
busByCard, err := queryAMDField("--showbus")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
infoByCard := map[string]amdGPUInfo{}
|
||||
for card, bus := range busByCard {
|
||||
bdf := normalizePCIeBDF(bus)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
||||
}
|
||||
if len(infoByCard) == 0 {
|
||||
return map[string]amdGPUInfo{}, nil
|
||||
}
|
||||
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
||||
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
||||
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
||||
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
||||
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
||||
|
||||
result := make(map[string]amdGPUInfo, len(infoByCard))
|
||||
for _, info := range infoByCard {
|
||||
if info.BDF == "" {
|
||||
continue
|
||||
}
|
||||
result[info.BDF] = info
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||
values, err := queryAMDNumericField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func queryAMDField(flag string) (map[string]string, error) {
|
||||
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseROCmSingleValueCSV(string(out)), nil
|
||||
}
|
||||
|
||||
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]float64{}
|
||||
for card, raw := range values {
|
||||
if value, ok := firstFloat(raw); ok {
|
||||
out[card] = value
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, pattern := range amdSMIExecutableGlobs {
|
||||
matches, err := amdSMIGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
return append([]string{match}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, exec.ErrNotFound
|
||||
}
|
||||
|
||||
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||
rows := map[string]string{}
|
||||
reader := csv.NewReader(strings.NewReader(raw))
|
||||
reader.FieldsPerRecord = -1
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return rows
|
||||
}
|
||||
for _, rec := range records {
|
||||
if len(rec) < 2 {
|
||||
continue
|
||||
}
|
||||
card := normalizeROCmCardKey(rec[0])
|
||||
if card == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||
continue
|
||||
}
|
||||
rows[card] = value
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
func normalizeROCmCardKey(raw string) string {
|
||||
raw = strings.ToLower(strings.TrimSpace(raw))
|
||||
raw = strings.Trim(raw, "\"")
|
||||
if raw == "" {
|
||||
return ""
|
||||
}
|
||||
if raw == "device" || raw == "gpu" || raw == "card" {
|
||||
return ""
|
||||
}
|
||||
if strings.HasPrefix(raw, "card") {
|
||||
return raw
|
||||
}
|
||||
if _, err := strconv.Atoi(raw); err == nil {
|
||||
return "card" + raw
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeCSVHeaderValue(value string) bool {
|
||||
value = strings.ToLower(strings.TrimSpace(value))
|
||||
return strings.Contains(value, "product") ||
|
||||
strings.Contains(value, "serial") ||
|
||||
strings.Contains(value, "vbios") ||
|
||||
strings.Contains(value, "bus")
|
||||
}
|
||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseROCmSingleValueCSV(t *testing.T) {
|
||||
raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
|
||||
got := parseROCmSingleValueCSV(raw)
|
||||
if got["card0"] != "ABC123" {
|
||||
t.Fatalf("card0=%q want ABC123", got["card0"])
|
||||
}
|
||||
if got["card1"] != "XYZ789" {
|
||||
t.Fatalf("card1=%q want XYZ789", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
|
||||
origExec := amdSMIExecCommand
|
||||
origLookPath := amdSMILookPath
|
||||
t.Cleanup(func() {
|
||||
amdSMIExecCommand = origExec
|
||||
amdSMILookPath = origLookPath
|
||||
})
|
||||
|
||||
amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
|
||||
amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
|
||||
}
|
||||
|
||||
got, err := queryAMDNumericField("--showtemp")
|
||||
if err != nil {
|
||||
t.Fatalf("queryAMDNumericField: %v", err)
|
||||
}
|
||||
if got["card0"] != 45.5 {
|
||||
t.Fatalf("card0=%v want 45.5", got["card0"])
|
||||
}
|
||||
if got["card1"] != 67.0 {
|
||||
t.Fatalf("card1=%v want 67.0", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeROCmCardKey(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"0": "card0",
|
||||
"card1": "card1",
|
||||
"Device": "",
|
||||
"": "",
|
||||
}
|
||||
for input, want := range tests {
|
||||
if got := normalizeROCmCardKey(input); got != want {
|
||||
t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,10 +4,27 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var execDmidecode = func(typeNum string) (string, error) {
|
||||
out, err := exec.Command("dmidecode", "-t", typeNum).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -61,6 +78,45 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
return board
|
||||
}
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
slog.Info("bmc: collected", "version", version)
|
||||
return []schema.HardwareFirmwareRecord{
|
||||
{DeviceName: "BMC", Version: version},
|
||||
}
|
||||
}
|
||||
|
||||
// parseBMCFirmwareRevision extracts the "Firmware Revision" field from ipmitool mc info output.
|
||||
func parseBMCFirmwareRevision(out string) string {
|
||||
for _, line := range strings.Split(out, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ":")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Firmware Revision" {
|
||||
return strings.TrimSpace(val)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parseBIOSFirmware extracts BIOS version from dmidecode type 0 output.
|
||||
func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord {
|
||||
fields := parseDMIFields(type0, "BIOS Information")
|
||||
@@ -141,9 +197,5 @@ func cleanDMIValue(v string) string {
|
||||
|
||||
// runDmidecode executes dmidecode -t <typeNum> and returns its stdout.
|
||||
func runDmidecode(typeNum string) (string, error) {
|
||||
out, err := exec.Command("dmidecode", "-t", typeNum).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
return execDmidecode(typeNum)
|
||||
}
|
||||
|
||||
@@ -23,10 +23,9 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
|
||||
cpus, cpuFW := collectCPUs(snap.Board.SerialNumber)
|
||||
snap.CPUs = cpus
|
||||
snap.Firmware = append(snap.Firmware, cpuFW...)
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
snap.Memory = collectMemory()
|
||||
sensorDoc, err := readSensorsJSONDoc()
|
||||
@@ -37,7 +36,9 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
|
||||
@@ -3,42 +3,39 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectCPUs runs dmidecode -t 4 and reads microcode version from sysfs.
|
||||
func collectCPUs(boardSerial string) ([]schema.HardwareCPU, []schema.HardwareFirmwareRecord) {
|
||||
// collectCPUs runs dmidecode -t 4 and enriches CPUs with microcode from sysfs.
|
||||
func collectCPUs() []schema.HardwareCPU {
|
||||
out, err := runDmidecode("4")
|
||||
if err != nil {
|
||||
slog.Warn("cpu: dmidecode type 4 failed", "err", err)
|
||||
return nil, nil
|
||||
return nil
|
||||
}
|
||||
|
||||
cpus := parseCPUs(out, boardSerial)
|
||||
|
||||
var firmware []schema.HardwareFirmwareRecord
|
||||
cpus := parseCPUs(out)
|
||||
if mc := readMicrocode(); mc != "" {
|
||||
firmware = append(firmware, schema.HardwareFirmwareRecord{
|
||||
DeviceName: "CPU Microcode",
|
||||
Version: mc,
|
||||
})
|
||||
for i := range cpus {
|
||||
cpus[i].Firmware = &mc
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("cpu: collected", "count", len(cpus))
|
||||
return cpus, firmware
|
||||
return cpus
|
||||
}
|
||||
|
||||
// parseCPUs splits dmidecode output into per-processor sections and parses each.
|
||||
func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
||||
func parseCPUs(output string) []schema.HardwareCPU {
|
||||
sections := splitDMISections(output, "Processor Information")
|
||||
cpus := make([]schema.HardwareCPU, 0, len(sections))
|
||||
|
||||
for _, section := range sections {
|
||||
cpu, ok := parseCPUSection(section, boardSerial)
|
||||
cpu, ok := parseCPUSection(section)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
@@ -49,7 +46,7 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
||||
|
||||
// parseCPUSection parses one "Processor Information" block into a HardwareCPU.
|
||||
// Returns false if the socket is unpopulated.
|
||||
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
||||
func parseCPUSection(fields map[string]string) (schema.HardwareCPU, bool) {
|
||||
status := parseCPUStatus(fields["Status"])
|
||||
if status == statusEmpty {
|
||||
return schema.HardwareCPU{}, false
|
||||
@@ -72,11 +69,6 @@ func parseCPUSection(fields map[string]string, boardSerial string) (schema.Hardw
|
||||
}
|
||||
if v := cleanDMIValue(fields["Serial Number"]); v != "" {
|
||||
cpu.SerialNumber = &v
|
||||
} else if boardSerial != "" && cpu.Socket != nil {
|
||||
// Intel Xeon never exposes serial via DMI — generate stable fallback
|
||||
// matching core's generateCPUVendorSerial() logic
|
||||
fb := fmt.Sprintf("%s-CPU-%d", boardSerial, *cpu.Socket)
|
||||
cpu.SerialNumber = &fb
|
||||
}
|
||||
|
||||
if v := parseMHz(fields["Max Speed"]); v > 0 {
|
||||
@@ -180,7 +172,7 @@ func parseInt(v string) int {
|
||||
// readMicrocode reads the CPU microcode revision from sysfs.
|
||||
// Returns empty string if unavailable.
|
||||
func readMicrocode() string {
|
||||
data, err := os.ReadFile("/sys/devices/system/cpu/cpu0/microcode/version")
|
||||
data, err := os.ReadFile(filepath.Join(cpuSysBaseDir, "cpu0", "microcode", "version"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseCPUs_dual_socket(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type4.txt")
|
||||
cpus := parseCPUs(out, "CAR315KA0803B90")
|
||||
cpus := parseCPUs(out)
|
||||
|
||||
if len(cpus) != 2 {
|
||||
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||
@@ -37,23 +39,22 @@ func TestParseCPUs_dual_socket(t *testing.T) {
|
||||
if cpu0.Status == nil || *cpu0.Status != "OK" {
|
||||
t.Errorf("cpu0 status: got %v, want OK", cpu0.Status)
|
||||
}
|
||||
// Intel Xeon serial not available → fallback
|
||||
if cpu0.SerialNumber == nil || *cpu0.SerialNumber != "CAR315KA0803B90-CPU-0" {
|
||||
t.Errorf("cpu0 serial fallback: got %v, want CAR315KA0803B90-CPU-0", cpu0.SerialNumber)
|
||||
if cpu0.SerialNumber != nil {
|
||||
t.Errorf("cpu0 serial should stay nil without source data, got %v", cpu0.SerialNumber)
|
||||
}
|
||||
|
||||
cpu1 := cpus[1]
|
||||
if cpu1.Socket == nil || *cpu1.Socket != 1 {
|
||||
t.Errorf("cpu1 socket: got %v, want 1", cpu1.Socket)
|
||||
}
|
||||
if cpu1.SerialNumber == nil || *cpu1.SerialNumber != "CAR315KA0803B90-CPU-1" {
|
||||
t.Errorf("cpu1 serial fallback: got %v, want CAR315KA0803B90-CPU-1", cpu1.SerialNumber)
|
||||
if cpu1.SerialNumber != nil {
|
||||
t.Errorf("cpu1 serial should stay nil without source data, got %v", cpu1.SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type4_disabled.txt")
|
||||
cpus := parseCPUs(out, "BOARD-001")
|
||||
cpus := parseCPUs(out)
|
||||
|
||||
if len(cpus) != 1 {
|
||||
t.Fatalf("expected 1 CPU (unpopulated skipped), got %d", len(cpus))
|
||||
@@ -63,6 +64,39 @@ func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestCollectCPUsSetsFirmwareFromMicrocode(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
origBase := cpuSysBaseDir
|
||||
cpuSysBaseDir = tmp
|
||||
t.Cleanup(func() { cpuSysBaseDir = origBase })
|
||||
|
||||
if err := os.MkdirAll(filepath.Join(tmp, "cpu0", "microcode"), 0755); err != nil {
|
||||
t.Fatalf("mkdir microcode dir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "cpu0", "microcode", "version"), []byte("0x2b000643\n"), 0644); err != nil {
|
||||
t.Fatalf("write microcode version: %v", err)
|
||||
}
|
||||
|
||||
origRun := execDmidecode
|
||||
execDmidecode = func(typeNum string) (string, error) {
|
||||
if typeNum != "4" {
|
||||
t.Fatalf("unexpected dmidecode type: %s", typeNum)
|
||||
}
|
||||
return mustReadFile(t, "testdata/dmidecode_type4.txt"), nil
|
||||
}
|
||||
t.Cleanup(func() { execDmidecode = origRun })
|
||||
|
||||
cpus := collectCPUs()
|
||||
if len(cpus) != 2 {
|
||||
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||
}
|
||||
for i, cpu := range cpus {
|
||||
if cpu.Firmware == nil || *cpu.Firmware != "0x2b000643" {
|
||||
t.Fatalf("cpu[%d] firmware=%v want microcode", i, cpu.Firmware)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCPUStatus(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
|
||||
@@ -2,16 +2,20 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
deduplicateComponentSerials(snap)
|
||||
}
|
||||
|
||||
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
|
||||
@@ -37,15 +41,40 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
hasIdentity := false
|
||||
switch {
|
||||
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||
hasIdentity = true
|
||||
case psu.Slot != nil && *psu.Slot != "":
|
||||
hasIdentity = true
|
||||
case psu.Model != nil && *psu.Model != "":
|
||||
hasIdentity = true
|
||||
case psu.Vendor != nil && *psu.Vendor != "":
|
||||
hasIdentity = true
|
||||
}
|
||||
if !hasIdentity {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
@@ -79,101 +108,3 @@ func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt stri
|
||||
status.StatusCheckedAt = &collectedAt
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
|
||||
deduplicateCPUSerials(snap.CPUs)
|
||||
deduplicateMemorySerials(snap.Memory)
|
||||
deduplicateStorageSerials(snap.Storage)
|
||||
deduplicatePCIeSerials(snap.PCIeDevices)
|
||||
deduplicatePSUSerials(snap.PowerSupplies)
|
||||
}
|
||||
|
||||
func deduplicateCPUSerials(items []schema.HardwareCPU) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateMemorySerials(items []schema.HardwareMemory) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].PartNumber)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateStorageSerials(items []schema.HardwareStorage) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,15 +41,18 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||
func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
|
||||
collectedAt := "2026-03-15T12:00:00Z"
|
||||
status := statusOK
|
||||
model := "Device"
|
||||
@@ -57,7 +70,24 @@ func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||
if got := *snap.Storage[0].SerialNumber; got != serial {
|
||||
t.Fatalf("first serial changed: %q", got)
|
||||
}
|
||||
if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
|
||||
t.Fatalf("duplicate serial mismatch: %q", got)
|
||||
if got := *snap.Storage[1].SerialNumber; got != serial {
|
||||
t.Fatalf("duplicate serial should stay unchanged: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
|
||||
slot := "0"
|
||||
status := statusOK
|
||||
|
||||
got := filterPSUs([]schema.HardwarePowerSupply{
|
||||
{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("unexpected kept PSU: %+v", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,18 +2,21 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
mstflintQuery = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -21,7 +24,7 @@ var (
|
||||
}
|
||||
|
||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -29,6 +32,14 @@ var (
|
||||
}
|
||||
|
||||
netIfacesByBDF = listNetIfacesByBDF
|
||||
readNetCarrierFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||
@@ -162,3 +173,9 @@ func listNetIfacesByBDF(bdf string) []string {
|
||||
}
|
||||
return ifaces
|
||||
}
|
||||
|
||||
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
var (
|
||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
if devs[i].SerialNumber == nil {
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
@@ -108,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
}
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case key == "identifier":
|
||||
s := parseSFPIdentifier(val)
|
||||
dev.SFPIdentifier = &s
|
||||
t := true
|
||||
dev.SFPPresent = &t
|
||||
changed = true
|
||||
case key == "connector":
|
||||
s := parseSFPConnector(val)
|
||||
dev.SFPConnector = &s
|
||||
changed = true
|
||||
case key == "vendor name":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPVendor = &s
|
||||
changed = true
|
||||
case key == "vendor pn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPPartNumber = &s
|
||||
changed = true
|
||||
case key == "vendor sn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPSerialNumber = &s
|
||||
changed = true
|
||||
case strings.Contains(key, "laser wavelength"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPWavelengthNM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPTemperatureC = &f
|
||||
@@ -140,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
return changed
|
||||
}
|
||||
|
||||
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||
func parseSFPIdentifier(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||
// e.g. "0x07 (LC)" → "LC".
|
||||
func parseSFPConnector(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||
|
||||
func extractParens(s string) string {
|
||||
m := parenRe.FindStringSubmatch(s)
|
||||
if len(m) < 2 {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(m[1])
|
||||
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPPresent != nil {
|
||||
out["sfp_present"] = *dev.SFPPresent
|
||||
}
|
||||
if dev.SFPIdentifier != nil {
|
||||
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||
}
|
||||
if dev.SFPConnector != nil {
|
||||
out["sfp_connector"] = *dev.SFPConnector
|
||||
}
|
||||
if dev.SFPVendor != nil {
|
||||
out["sfp_vendor"] = *dev.SFPVendor
|
||||
}
|
||||
if dev.SFPPartNumber != nil {
|
||||
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||
}
|
||||
if dev.SFPSerialNumber != nil {
|
||||
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||
}
|
||||
if dev.SFPWavelengthNM != nil {
|
||||
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||
}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSFPDOM(t *testing.T) {
|
||||
raw := `
|
||||
@@ -29,6 +33,110 @@ func TestParseSFPDOM(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||
raw := `
|
||||
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||
Serial number: NIC-SN-12345
|
||||
`
|
||||
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePCIVPDSerial(t *testing.T) {
|
||||
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:18:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: NIC-SN-98765\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
if iface != "eth0" {
|
||||
t.Fatalf("unexpected iface: %s", iface)
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||
}
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -13,29 +13,34 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
Index int
|
||||
BDF string
|
||||
Name string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
|
||||
// a stable serial fallback based on board serial + slot.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasNVIDIADevices(devs) {
|
||||
return devs
|
||||
}
|
||||
gpuByBDF, err := queryNVIDIAGPUs()
|
||||
if err != nil {
|
||||
slog.Info("nvidia: enrichment skipped", "err", err)
|
||||
return enrichPCIeWithNVIDIAData(devs, nil, boardSerial, false)
|
||||
return enrichPCIeWithNVIDIAData(devs, nil, false)
|
||||
}
|
||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, boardSerial, true)
|
||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, true)
|
||||
}
|
||||
|
||||
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
@@ -47,7 +52,7 @@ func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, boardSerial string, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isNVIDIADevice(devs[i]) {
|
||||
@@ -55,7 +60,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
}
|
||||
|
||||
if !driverLoaded {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
setPCIeFallback(&devs[i])
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -65,14 +70,15 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
}
|
||||
info, ok := gpuByBDF[bdf]
|
||||
if !ok {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
setPCIeFallback(&devs[i])
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Name); v != "" {
|
||||
devs[i].Model = &v
|
||||
}
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
} else {
|
||||
setPCIeFallbackSerial(&devs[i], boardSerial)
|
||||
}
|
||||
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
||||
devs[i].Firmware = &v
|
||||
@@ -97,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -121,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 14 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -131,14 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Name: strings.TrimSpace(rec[2]),
|
||||
Serial: strings.TrimSpace(rec[3]),
|
||||
VBIOS: strings.TrimSpace(rec[4]),
|
||||
TemperatureC: parseMaybeFloat(rec[5]),
|
||||
PowerW: parseMaybeFloat(rec[6]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||
HWSlowdown: parseMaybeBool(rec[9]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -170,6 +182,30 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt(v string) *int {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseRequiredInt(v string) int {
|
||||
n, err := strconv.Atoi(strings.TrimSpace(v))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -213,27 +249,16 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
setPCIeFallbackSerial(dev, boardSerial)
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
status := statusUnknown
|
||||
dev.Status = &status
|
||||
}
|
||||
|
||||
func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
if strings.TrimSpace(boardSerial) == "" || dev.SerialNumber != nil {
|
||||
return
|
||||
}
|
||||
slot := "unknown"
|
||||
if dev.BDF != nil && strings.TrimSpace(*dev.BDF) != "" {
|
||||
slot = strings.TrimSpace(*dev.BDF)
|
||||
} else if dev.Slot != nil && strings.TrimSpace(*dev.Slot) != "" {
|
||||
slot = strings.TrimSpace(*dev.Slot)
|
||||
}
|
||||
fb := fmt.Sprintf("%s-PCIE-%s", boardSerial, slot)
|
||||
dev.SerialNumber = &fb
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||
if info.TemperatureC != nil {
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
@@ -249,4 +274,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("gpu by normalized bdf not found")
|
||||
}
|
||||
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||
t.Fatalf("name: got %q", gpu.Name)
|
||||
}
|
||||
if gpu.Serial != "GPU-SERIAL-1" {
|
||||
t.Fatalf("serial: got %q", gpu.Serial)
|
||||
}
|
||||
@@ -28,6 +31,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
@@ -73,13 +82,16 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
out := enrichPCIeWithNVIDIAData(devices, byBDF, "BOARD-001", true)
|
||||
out := enrichPCIeWithNVIDIAData(devices, byBDF, true)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-ABC" {
|
||||
t.Fatalf("serial: got %v", out[0].SerialNumber)
|
||||
}
|
||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||
}
|
||||
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||
t.Fatalf("status: got %v", out[0].Status)
|
||||
}
|
||||
@@ -103,9 +115,9 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
out := enrichPCIeWithNVIDIAData(devices, nil, "BOARD-123", false)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
||||
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
||||
out := enrichPCIeWithNVIDIAData(devices, nil, false)
|
||||
if out[0].SerialNumber != nil {
|
||||
t.Fatalf("serial should stay nil without source data, got %v", out[0].SerialNumber)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusUnknown {
|
||||
t.Fatalf("fallback status: got %v", out[0].Status)
|
||||
|
||||
@@ -2,6 +2,7 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
@@ -37,7 +38,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
val := strings.TrimSpace(line[idx+2:])
|
||||
fields[key] = val
|
||||
}
|
||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
||||
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||
continue
|
||||
}
|
||||
dev := parseLspciDevice(fields)
|
||||
@@ -46,8 +47,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldIncludePCIeDevice(class string) bool {
|
||||
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
c := strings.ToLower(strings.TrimSpace(class))
|
||||
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||
d := strings.ToLower(strings.TrimSpace(device))
|
||||
if c == "" {
|
||||
return true
|
||||
}
|
||||
@@ -57,6 +60,9 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
"system peripheral",
|
||||
"communication controller",
|
||||
@@ -66,12 +72,47 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"audio device",
|
||||
"serial bus controller",
|
||||
"unassigned class",
|
||||
"non-essential instrumentation",
|
||||
}
|
||||
for _, bad := range excluded {
|
||||
if strings.Contains(c, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||
bmcPatterns := []string{
|
||||
"management system chip",
|
||||
"management controller",
|
||||
"ibmc",
|
||||
"idrac",
|
||||
"ilo vga",
|
||||
"aspeed",
|
||||
"matrox",
|
||||
}
|
||||
for _, bad := range bmcPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
"reserved spp",
|
||||
"ptdma",
|
||||
"cryptographic coprocessor pspcpp",
|
||||
"pspcpp",
|
||||
}
|
||||
for _, bad := range internalAMDPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -84,6 +125,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
if bdf := fields["Slot"]; bdf != "" {
|
||||
dev.Slot = &bdf
|
||||
dev.BDF = &bdf
|
||||
// parse vendor_id and device_id from sysfs
|
||||
vendorID, deviceID := readPCIIDs(bdf)
|
||||
@@ -95,6 +137,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
@@ -129,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
@@ -162,6 +209,18 @@ func readPCINumaNode(bdf string) (int, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
func parsePCINumaNode(raw string) (int, bool) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return 0, false
|
||||
}
|
||||
value, err := strconv.Atoi(raw)
|
||||
if err != nil || value < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return value, true
|
||||
}
|
||||
|
||||
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||
if err != nil {
|
||||
@@ -186,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
|
||||
// Returns 0 for unrecognised values so comparisons fail safe.
|
||||
func pcieLinkSpeedRank(gen string) int {
|
||||
switch gen {
|
||||
case "Gen1":
|
||||
return 1
|
||||
case "Gen2":
|
||||
return 2
|
||||
case "Gen3":
|
||||
return 3
|
||||
case "Gen4":
|
||||
return 4
|
||||
case "Gen5":
|
||||
return 5
|
||||
case "Gen6":
|
||||
return 6
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func normalizePCILinkSpeed(raw string) string {
|
||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||
switch {
|
||||
|
||||
@@ -1,34 +1,53 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
class string
|
||||
want bool
|
||||
name string
|
||||
class string
|
||||
vendor string
|
||||
device string
|
||||
want bool
|
||||
}{
|
||||
{"USB controller", false},
|
||||
{"System peripheral", false},
|
||||
{"Audio device", false},
|
||||
{"Host bridge", false},
|
||||
{"PCI bridge", false},
|
||||
{"SMBus", false},
|
||||
{"Ethernet controller", true},
|
||||
{"RAID bus controller", true},
|
||||
{"Non-Volatile memory controller", true},
|
||||
{"VGA compatible controller", true},
|
||||
{name: "usb", class: "USB controller", want: false},
|
||||
{name: "system peripheral", class: "System peripheral", want: false},
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := shouldIncludePCIeDevice(tt.class)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
@@ -38,6 +57,70 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
|
||||
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
||||
}
|
||||
if devs[0].Slot == nil || *devs[0].Slot != "0000:65:00.0" {
|
||||
t.Fatalf("slot: got %v", devs[0].Slot)
|
||||
}
|
||||
if devs[0].BDF == nil || *devs[0].BDF != "0000:65:00.0" {
|
||||
t.Fatalf("bdf: got %v", devs[0].BDF)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
data, err := json.Marshal(devs[0])
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
if !strings.Contains(text, `"slot":"0000:65:00.0"`) {
|
||||
t.Fatalf("json missing slot: %s", text)
|
||||
}
|
||||
if strings.Contains(text, `"bdf"`) {
|
||||
t.Fatalf("json should not emit bdf: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
@@ -59,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||
ptr := func(s string) *string { return &s }
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
linkSpeed *string
|
||||
maxSpeed *string
|
||||
wantWarning bool
|
||||
wantGenIn string // substring expected in ErrorDescription when warning
|
||||
}{
|
||||
{
|
||||
name: "degraded Gen1 vs Gen5",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen1",
|
||||
},
|
||||
{
|
||||
name: "at max Gen5",
|
||||
linkSpeed: ptr("Gen5"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "degraded Gen4 vs Gen5",
|
||||
linkSpeed: ptr("Gen4"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen4",
|
||||
},
|
||||
{
|
||||
name: "missing current speed — no warning",
|
||||
linkSpeed: nil,
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "missing max speed — no warning",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: nil,
|
||||
wantWarning: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
ok := statusOK
|
||||
dev.Status = &ok
|
||||
dev.LinkSpeed = tt.linkSpeed
|
||||
dev.MaxLinkSpeed = tt.maxSpeed
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||
if gotWarn != tt.wantWarning {
|
||||
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||
}
|
||||
if tt.wantWarning {
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("expected ErrorDescription to be set")
|
||||
}
|
||||
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||
}
|
||||
} else {
|
||||
if dev.ErrorDescription != nil {
|
||||
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
readPCIVPDFile = func(bdf string) ([]byte, error) {
|
||||
return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
|
||||
}
|
||||
)
|
||||
|
||||
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !shouldProbePCIeSerial(devs[i]) {
|
||||
continue
|
||||
}
|
||||
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("pcie: serials enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(class) || isGPUClass(class)
|
||||
}
|
||||
|
||||
func queryPCIDeviceSerial(bdf string) string {
|
||||
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||
return parsePCIVPDSerial(raw)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseLSPCIDetailSerial(raw string) string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
lower := strings.ToLower(line)
|
||||
if !strings.Contains(lower, "serial number:") {
|
||||
continue
|
||||
}
|
||||
idx := strings.Index(line, ":")
|
||||
if idx < 0 {
|
||||
continue
|
||||
}
|
||||
if serial := strings.TrimSpace(line[idx+1:]); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parsePCIVPDSerial(raw []byte) string {
|
||||
for i := 0; i+3 < len(raw); i++ {
|
||||
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||
continue
|
||||
}
|
||||
length := int(raw[i+2])
|
||||
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||
if !looksLikeSerial(value) {
|
||||
continue
|
||||
}
|
||||
return value
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeSerial(value string) bool {
|
||||
if len(value) < 4 {
|
||||
return false
|
||||
}
|
||||
hasAlphaNum := false
|
||||
for _, r := range value {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
|
||||
hasAlphaNum = true
|
||||
case strings.ContainsRune(" -_./:", r):
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
return hasAlphaNum
|
||||
}
|
||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:11:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: GPU-SN-12345\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
bdf := "0000:11:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithPCISerials(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||
class := "StorageController"
|
||||
bdf := "0000:19:00.0"
|
||||
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||
if shouldProbePCIeSerial(dev) {
|
||||
t.Fatal("unexpected probe for storage controller")
|
||||
}
|
||||
}
|
||||
@@ -5,21 +5,31 @@ import (
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
// ipmitool requires /dev/ipmi0 — not available on non-server hardware
|
||||
out, err := exec.Command("ipmitool", "fru", "print").Output()
|
||||
if err != nil {
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
} else {
|
||||
mergePSUSDR(psus, sdrData)
|
||||
}
|
||||
} else if len(psus) == 0 {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
psus := parseFRU(string(out))
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
mergePSUSDR(psus, parsePSUSDR(string(sdrOut)))
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
return psus
|
||||
}
|
||||
@@ -79,9 +89,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
|
||||
// Only process PSU FRU records
|
||||
headerLower := strings.ToLower(header)
|
||||
if !strings.Contains(headerLower, "psu") &&
|
||||
!strings.Contains(headerLower, "power supply") &&
|
||||
!strings.Contains(headerLower, "power_supply") {
|
||||
if !isPSUHeader(headerLower) {
|
||||
return schema.HardwarePowerSupply{}, false
|
||||
}
|
||||
|
||||
@@ -89,21 +97,24 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
psu := schema.HardwarePowerSupply{Present: &present}
|
||||
|
||||
slotStr := strconv.Itoa(slotIdx)
|
||||
if slot, ok := parsePSUSlot(header); ok && slot > 0 {
|
||||
slotStr = strconv.Itoa(slot - 1)
|
||||
}
|
||||
psu.Slot = &slotStr
|
||||
|
||||
if v := cleanDMIValue(fields["Board Product"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Product", "Product Name", "Product Part Number"); v != "" {
|
||||
psu.Model = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Mfg"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Mfg", "Product Manufacturer", "Product Manufacturer Name"); v != "" {
|
||||
psu.Vendor = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Serial"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Serial", "Product Serial", "Product Serial Number"); v != "" {
|
||||
psu.SerialNumber = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Part Number"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Part Number", "Product Part Number", "Part Number"); v != "" {
|
||||
psu.PartNumber = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Extra"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Extra", "Product Version", "Board Version"); v != "" {
|
||||
psu.Firmware = &v
|
||||
}
|
||||
|
||||
@@ -120,6 +131,23 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
return psu, true
|
||||
}
|
||||
|
||||
func isPSUHeader(headerLower string) bool {
|
||||
return strings.Contains(headerLower, "psu") ||
|
||||
strings.Contains(headerLower, "pws") ||
|
||||
strings.Contains(headerLower, "power supply") ||
|
||||
strings.Contains(headerLower, "power_supply") ||
|
||||
strings.Contains(headerLower, "power module")
|
||||
}
|
||||
|
||||
func firstNonEmptyField(fields map[string]string, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
if value := cleanDMIValue(fields[key]); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
type psuSDR struct {
|
||||
slot int
|
||||
status string
|
||||
@@ -131,7 +159,56 @@ type psuSDR struct {
|
||||
healthPct *float64
|
||||
}
|
||||
|
||||
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
|
||||
var psuSlotPatterns = []*regexp.Regexp{
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||
// Must be last: "power supply N" is already caught by the pattern above.
|
||||
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||
}
|
||||
|
||||
// psuInputPowerKeywords matches AC-input power sensor names across vendors:
|
||||
// MSI: PSU1_POWER_IN, PSU1_PIN
|
||||
// MLT: PSU1_PIN
|
||||
// xFusion: (matched via default fallback — no explicit keyword)
|
||||
// HPE: PS1 Input Power, PS1 Input Watts
|
||||
func isPSUInputPower(name string) bool {
|
||||
return strings.Contains(name, "input power") ||
|
||||
strings.Contains(name, "input watts") ||
|
||||
strings.Contains(name, "_pin") ||
|
||||
strings.Contains(name, " pin") ||
|
||||
strings.Contains(name, "_power_in") ||
|
||||
strings.Contains(name, "power_in")
|
||||
}
|
||||
|
||||
// isPSUOutputPower matches DC-output power sensor names across vendors:
|
||||
// MSI: PSU1_POWER_OUT
|
||||
// MLT: PSU1_POUT
|
||||
// xFusion: PS1 POut
|
||||
func isPSUOutputPower(name string) bool {
|
||||
return strings.Contains(name, "output power") ||
|
||||
strings.Contains(name, "output watts") ||
|
||||
strings.Contains(name, "_pout") ||
|
||||
strings.Contains(name, " pout") ||
|
||||
strings.Contains(name, "_power_out") ||
|
||||
strings.Contains(name, "power_out") ||
|
||||
strings.Contains(name, "power supply bay") ||
|
||||
strings.Contains(name, "psu bay")
|
||||
}
|
||||
|
||||
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||
v := parseFloatPtr(raw)
|
||||
if v == nil || *v <= 0 || *v > max {
|
||||
return nil
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
out := map[int]psuSDR{}
|
||||
@@ -160,22 +237,102 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
switch {
|
||||
case strings.Contains(lowerName, "input power"):
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case isPSUInputPower(lowerName):
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
case isPSUOutputPower(lowerName):
|
||||
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
default:
|
||||
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||
// AC input if the value looks like wattage and no better data is set yet.
|
||||
if entry.inputPowerW == nil {
|
||||
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||
}
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// PSUSlotPower holds SDR power readings for one PSU slot.
|
||||
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
|
||||
// matching HardwarePowerSupply.Slot in the audit schema.
|
||||
type PSUSlotPower struct {
|
||||
InputW *float64 `json:"input_w,omitempty"`
|
||||
OutputW *float64 `json:"output_w,omitempty"`
|
||||
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||
sdr := parsePSUSDR(sdrOutput)
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]PSUSlotPower, len(sdr))
|
||||
for slot, entry := range sdr {
|
||||
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||
out[key] = PSUSlotPower{
|
||||
InputW: entry.inputPowerW,
|
||||
OutputW: entry.outputPowerW,
|
||||
Status: entry.status,
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := make([]int, 0, len(sdr))
|
||||
for slot := range sdr {
|
||||
slots = append(slots, slot)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(slots))
|
||||
for _, slot := range slots {
|
||||
entry := sdr[slot]
|
||||
present := true
|
||||
status := entry.status
|
||||
if status == "" {
|
||||
status = statusUnknown
|
||||
}
|
||||
slotStr := strconv.Itoa(slot - 1)
|
||||
model := "PSU"
|
||||
psu := schema.HardwarePowerSupply{
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Slot: &slotStr,
|
||||
Present: &present,
|
||||
Model: &model,
|
||||
InputPowerW: entry.inputPowerW,
|
||||
OutputPowerW: entry.outputPowerW,
|
||||
InputVoltage: entry.inputVoltage,
|
||||
TemperatureC: entry.temperatureC,
|
||||
}
|
||||
if entry.healthPct != nil {
|
||||
psu.LifeRemainingPct = entry.healthPct
|
||||
lifeUsed := 100 - *entry.healthPct
|
||||
psu.LifeUsedPct = &lifeUsed
|
||||
}
|
||||
if entry.reason != "" {
|
||||
psu.ErrorDescription = &entry.reason
|
||||
}
|
||||
out = append(out, psu)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
||||
for i := range psus {
|
||||
slotIdx, err := strconv.Atoi(derefPSUSlot(psus[i].Slot))
|
||||
@@ -231,17 +388,19 @@ func splitSDRFields(line string) []string {
|
||||
}
|
||||
|
||||
func parsePSUSlot(name string) (int, bool) {
|
||||
m := psuSlotRe.FindStringSubmatch(strings.ToLower(name))
|
||||
if len(m) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
for _, group := range m[1:] {
|
||||
if group == "" {
|
||||
for _, re := range psuSlotPatterns {
|
||||
m := re.FindStringSubmatch(strings.ToLower(name))
|
||||
if len(m) == 0 {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(group)
|
||||
if err == nil && n > 0 {
|
||||
return n, true
|
||||
for _, group := range m[1:] {
|
||||
if group == "" {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(group)
|
||||
if err == nil && n > 0 {
|
||||
return n, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
|
||||
@@ -38,3 +38,54 @@ PS2 Input Power | 0 Watts | cr
|
||||
t.Fatalf("ps2 status=%q", got[2].status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
want int
|
||||
}{
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got, ok := parsePSUSlot(tt.name)
|
||||
if !ok || got != tt.want {
|
||||
t.Fatalf("parsePSUSlot(%q)=(%d,%v) want (%d,true)", tt.name, got, ok, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
health := 97.0
|
||||
outputPower := 915.0
|
||||
got := synthesizePSUsFromSDR(map[int]psuSDR{
|
||||
1: {
|
||||
slot: 1,
|
||||
status: statusOK,
|
||||
outputPowerW: &outputPower,
|
||||
healthPct: &health,
|
||||
},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("slot=%v want 0", got[0].Slot)
|
||||
}
|
||||
if got[0].OutputPowerW == nil || *got[0].OutputPowerW != 915 {
|
||||
t.Fatalf("output power=%v", got[0].OutputPowerW)
|
||||
}
|
||||
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 97 {
|
||||
t.Fatalf("life remaining=%v", got[0].LifeRemainingPct)
|
||||
}
|
||||
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 3 {
|
||||
t.Fatalf("life used=%v", got[0].LifeUsedPct)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,19 +113,8 @@ func isLikelyPSUTemp(chip, feature string) bool {
|
||||
|
||||
func detectPSUSlot(parts ...string) (string, bool) {
|
||||
for _, part := range parts {
|
||||
lower := strings.ToLower(part)
|
||||
matches := psuSlotRe.FindStringSubmatch(lower)
|
||||
if len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, group := range matches[1:] {
|
||||
if group == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(group)
|
||||
if err == nil && value > 0 {
|
||||
return strconv.Itoa(value - 1), true
|
||||
}
|
||||
if value, ok := parsePSUSlot(part); ok && value > 0 {
|
||||
return strconv.Itoa(value - 1), true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
|
||||
@@ -5,11 +5,13 @@ import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := lsblkDevices()
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
var s schema.HardwareStorage
|
||||
@@ -39,6 +41,64 @@ type lsblkRoot struct {
|
||||
Blockdevices []lsblkDevice `json:"blockdevices"`
|
||||
}
|
||||
|
||||
type nvmeListRoot struct {
|
||||
Devices []nvmeListDevice `json:"Devices"`
|
||||
}
|
||||
|
||||
type nvmeListDevice struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
ModelNumber string `json:"ModelNumber"`
|
||||
SerialNumber string `json:"SerialNumber"`
|
||||
Firmware string `json:"Firmware"`
|
||||
PhysicalSize int64 `json:"PhysicalSize"`
|
||||
}
|
||||
|
||||
func discoverStorageDevices() []lsblkDevice {
|
||||
merged := map[string]lsblkDevice{}
|
||||
for _, dev := range lsblkDevices() {
|
||||
if dev.Name == "" {
|
||||
continue
|
||||
}
|
||||
merged[dev.Name] = dev
|
||||
}
|
||||
for _, dev := range nvmeListDevices() {
|
||||
if dev.Name == "" {
|
||||
continue
|
||||
}
|
||||
current := merged[dev.Name]
|
||||
merged[dev.Name] = mergeStorageDevice(current, dev)
|
||||
}
|
||||
|
||||
disks := make([]lsblkDevice, 0, len(merged))
|
||||
for _, dev := range merged {
|
||||
if dev.Type == "" {
|
||||
dev.Type = "disk"
|
||||
}
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
func isVirtualHDiskModel(model string) bool {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
return strings.HasPrefix(model, "virtual hdisk")
|
||||
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
@@ -60,6 +120,59 @@ func lsblkDevices() []lsblkDevice {
|
||||
return disks
|
||||
}
|
||||
|
||||
func nvmeListDevices() []lsblkDevice {
|
||||
out, err := exec.Command("nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root nvmeListRoot
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
slog.Warn("storage: nvme list parse failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
devices := make([]lsblkDevice, 0, len(root.Devices))
|
||||
for _, dev := range root.Devices {
|
||||
name := filepath.Base(strings.TrimSpace(dev.DevicePath))
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, lsblkDevice{
|
||||
Name: name,
|
||||
Type: "disk",
|
||||
Size: strconv.FormatInt(dev.PhysicalSize, 10),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Tran: "nvme",
|
||||
})
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
func mergeStorageDevice(existing, incoming lsblkDevice) lsblkDevice {
|
||||
if existing.Name == "" {
|
||||
return incoming
|
||||
}
|
||||
if existing.Type == "" {
|
||||
existing.Type = incoming.Type
|
||||
}
|
||||
if strings.TrimSpace(existing.Size) == "" {
|
||||
existing.Size = incoming.Size
|
||||
}
|
||||
if strings.TrimSpace(existing.Serial) == "" {
|
||||
existing.Serial = incoming.Serial
|
||||
}
|
||||
if strings.TrimSpace(existing.Model) == "" {
|
||||
existing.Model = incoming.Model
|
||||
}
|
||||
if strings.TrimSpace(existing.Tran) == "" {
|
||||
existing.Tran = incoming.Tran
|
||||
}
|
||||
if strings.TrimSpace(existing.Hctl) == "" {
|
||||
existing.Hctl = incoming.Hctl
|
||||
}
|
||||
return existing
|
||||
}
|
||||
|
||||
// smartctlInfo is the subset of smartctl -j -a output we care about.
|
||||
type smartctlInfo struct {
|
||||
ModelFamily string `json:"model_family"`
|
||||
@@ -94,6 +207,7 @@ type smartctlInfo struct {
|
||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -252,9 +366,22 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
s.Model = &v
|
||||
}
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Serial)); v != "" {
|
||||
s.SerialNumber = &v
|
||||
}
|
||||
if size := parseStorageBytes(dev.Size); size > 0 {
|
||||
gb := int(size / 1_000_000_000)
|
||||
if gb > 0 {
|
||||
s.SizeGB = &gb
|
||||
}
|
||||
}
|
||||
|
||||
// id-ctrl: model, serial, firmware, capacity
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
@@ -335,6 +462,14 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
return s
|
||||
}
|
||||
|
||||
func parseStorageBytes(raw string) int64 {
|
||||
value, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
|
||||
if err == nil && value > 0 {
|
||||
return value
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
if units <= 0 {
|
||||
return 0
|
||||
|
||||
33
audit/internal/collector/storage_discovery_test.go
Normal file
33
audit/internal/collector/storage_discovery_test.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := mergeStorageDevice(
|
||||
lsblkDevice{Name: "nvme0n1", Type: "disk", Tran: "nvme"},
|
||||
lsblkDevice{Name: "nvme0n1", Type: "disk", Size: "1024", Serial: "SN123", Model: "Kioxia"},
|
||||
)
|
||||
|
||||
if got.Serial != "SN123" {
|
||||
t.Fatalf("serial=%q want SN123", got.Serial)
|
||||
}
|
||||
if got.Model != "Kioxia" {
|
||||
t.Fatalf("model=%q want Kioxia", got.Model)
|
||||
}
|
||||
if got.Size != "1024" {
|
||||
t.Fatalf("size=%q want 1024", got.Size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := parseStorageBytes(" 2048 "); got != 2048 {
|
||||
t.Fatalf("parseStorageBytes=%d want 2048", got)
|
||||
}
|
||||
if got := parseStorageBytes("1.92 TB"); got != 0 {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
4534
audit/internal/platform/benchmark.go
Normal file
4534
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
554
audit/internal/platform/benchmark_report.go
Normal file
554
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,554 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return renderBenchmarkReportWithCharts(result)
|
||||
}
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||
|
||||
// System identity block
|
||||
if result.ServerModel != "" {
|
||||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||
}
|
||||
if result.Hostname != "" {
|
||||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||
}
|
||||
// GPU models summary
|
||||
if len(result.GPUs) > 0 {
|
||||
modelCount := make(map[string]int)
|
||||
var modelOrder []string
|
||||
for _, g := range result.GPUs {
|
||||
m := strings.TrimSpace(g.Name)
|
||||
if m == "" {
|
||||
m = "Unknown GPU"
|
||||
}
|
||||
if modelCount[m] == 0 {
|
||||
modelOrder = append(modelOrder, m)
|
||||
}
|
||||
modelCount[m]++
|
||||
}
|
||||
var parts []string
|
||||
for _, m := range modelOrder {
|
||||
if modelCount[m] == 1 {
|
||||
parts = append(parts, m)
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
if result.RampRunID != "" {
|
||||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||
}
|
||||
} else if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
if result.PlatformPowerScore > 0 {
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||
if len(result.Findings) > 0 {
|
||||
b.WriteString("## Executive Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
fmt.Fprintf(&b, "- %s\n", finding)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
b.WriteString("## Warnings\n\n")
|
||||
for _, warning := range result.Warnings {
|
||||
fmt.Fprintf(&b, "- %s\n", warning)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Balanced Scorecard ────────────────────────────────────────────────────
|
||||
b.WriteString("## Balanced Scorecard\n\n")
|
||||
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||
fmt.Sprintf("%.1f°C", headroom),
|
||||
throttlePct,
|
||||
thermalStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
powerCap,
|
||||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||
fanDuty,
|
||||
powerStatus,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||
synthetic, mixed, mixedEff, topsPerSM,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||
|
||||
// Identity
|
||||
if gpu.BusID != "" {
|
||||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||
}
|
||||
if gpu.VBIOS != "" {
|
||||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||
}
|
||||
if gpu.ComputeCapability != "" {
|
||||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||
}
|
||||
if gpu.MultiprocessorCount > 0 {
|
||||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||
}
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.PowerLimitDerated {
|
||||
fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
|
||||
}
|
||||
if gpu.CalibratedPeakPowerW > 0 {
|
||||
if gpu.CalibratedPeakTempC > 0 {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
|
||||
}
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"", "Avg", "P95"},
|
||||
[][]string{
|
||||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
}
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
var precRows [][]string
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
if !p.ECC.IsZero() {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
status := p.Status
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
precRows = append(precRows, []string{
|
||||
p.Precision, status,
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||
eccCorr, eccUncorr,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||||
}
|
||||
|
||||
// ECC summary
|
||||
if !gpu.ECC.IsZero() {
|
||||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||||
}
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
if throttle != "none" {
|
||||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||
}
|
||||
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
var presRows [][]string
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
presRows = append(presRows, []string{
|
||||
p.Name,
|
||||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||
fmt.Sprintf("×%.3g", p.Weight),
|
||||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||
fmt.Sprintf("%d", p.Lanes),
|
||||
fmt.Sprintf("%d", p.Iterations),
|
||||
})
|
||||
} else {
|
||||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||
}
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Degradation / Notes
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
}
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("**Notes:**\n\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Avg", "Max"},
|
||||
[][]string{
|
||||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(result.Interconnect.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
} else {
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||
}
|
||||
if sp.ReportingRatio > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(sp.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
|
||||
for _, issue := range result.PSUIssues {
|
||||
fmt.Fprintf(&b, "- ⛔ %s\n", issue)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Value"},
|
||||
[][]string{
|
||||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||
{"Average fan duty cycle", dutyAvg},
|
||||
{"P95 fan duty cycle", dutyP95},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
}
|
||||
for _, note := range cooling.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(cooling.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Platform Scalability ──────────────────────────────────────────────────
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
var scalRows [][]string
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||
type counter struct {
|
||||
label string
|
||||
us uint64
|
||||
}
|
||||
counters := []counter{
|
||||
{"sw_power", t.SWPowerCapUS},
|
||||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||
{"sync_boost", t.SyncBoostUS},
|
||||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||
}
|
||||
var parts []string
|
||||
for _, c := range counters {
|
||||
if c.us == 0 {
|
||||
continue
|
||||
}
|
||||
sec := float64(c.us) / 1e6
|
||||
if steadyDurationSec > 0 {
|
||||
pct := sec / steadyDurationSec * 100
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||
} else if sec < 1 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||
}
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return "none"
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||
var best float64
|
||||
for i, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||
best = gpu.Scores.CompositeScore
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// fmtMDTable renders a markdown table with column widths padded so the table
|
||||
// is readable as plain text without a markdown renderer.
|
||||
//
|
||||
// headers contains the column header strings.
|
||||
// rows contains data rows; each row must have the same number of cells as headers.
|
||||
// Cells with fewer entries than headers are treated as empty.
|
||||
func fmtMDTable(headers []string, rows [][]string) string {
|
||||
ncols := len(headers)
|
||||
if ncols == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute max width per column.
|
||||
widths := make([]int, ncols)
|
||||
for i, h := range headers {
|
||||
if len(h) > widths[i] {
|
||||
widths[i] = len(h)
|
||||
}
|
||||
}
|
||||
for _, row := range rows {
|
||||
for i := 0; i < ncols; i++ {
|
||||
cell := ""
|
||||
if i < len(row) {
|
||||
cell = row[i]
|
||||
}
|
||||
if len(cell) > widths[i] {
|
||||
widths[i] = len(cell)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
// Header row.
|
||||
b.WriteByte('|')
|
||||
for i, h := range headers {
|
||||
b.WriteByte(' ')
|
||||
b.WriteString(h)
|
||||
b.WriteString(strings.Repeat(" ", widths[i]-len(h)))
|
||||
b.WriteString(" |")
|
||||
}
|
||||
b.WriteByte('\n')
|
||||
|
||||
// Separator row.
|
||||
b.WriteByte('|')
|
||||
for i := range headers {
|
||||
b.WriteString(strings.Repeat("-", widths[i]+2))
|
||||
b.WriteByte('|')
|
||||
}
|
||||
b.WriteByte('\n')
|
||||
|
||||
// Data rows.
|
||||
for _, row := range rows {
|
||||
b.WriteByte('|')
|
||||
for i := 0; i < ncols; i++ {
|
||||
cell := ""
|
||||
if i < len(row) {
|
||||
cell = row[i]
|
||||
}
|
||||
b.WriteByte(' ')
|
||||
b.WriteString(cell)
|
||||
b.WriteString(strings.Repeat(" ", widths[i]-len(cell)))
|
||||
b.WriteString(" |")
|
||||
}
|
||||
b.WriteByte('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
403
audit/internal/platform/benchmark_test.go
Normal file
403
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,403 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
profile string
|
||||
want benchmarkProfileSpec
|
||||
}{
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := resolveBenchmarkProfile(tc.profile)
|
||||
if got != tc.want {
|
||||
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if len(labels) != 5 || len(phases) != 5 {
|
||||
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
|
||||
}
|
||||
if basePhaseSec != 60 {
|
||||
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 300 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||
}
|
||||
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 300 {
|
||||
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 3600 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||
benchmarkPrecisionPhases,
|
||||
func(label string) string { return label },
|
||||
)
|
||||
if basePhaseSec != 3600 {
|
||||
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||
}
|
||||
if mixedPhaseSec != 14400 {
|
||||
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||
}
|
||||
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
|
||||
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
phases := []benchmarkPlannedPhase{
|
||||
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||
}
|
||||
rows := []GPUMetricRow{
|
||||
{ElapsedSec: 5},
|
||||
{ElapsedSec: 15},
|
||||
{ElapsedSec: 25},
|
||||
{ElapsedSec: 65},
|
||||
}
|
||||
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||
if len(got["fp8"]) != 1 {
|
||||
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||
}
|
||||
if len(got["fp16"]) != 1 {
|
||||
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||
}
|
||||
if len(got["mixed"]) != 2 {
|
||||
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
|
||||
t.Fatalf("supported=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
raw string
|
||||
wantStatus string
|
||||
}{
|
||||
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
|
||||
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
|
||||
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
|
||||
if got != tc.wantStatus {
|
||||
t.Fatalf("status=%q want %q", got, tc.wantStatus)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||
Profile: "stability",
|
||||
RunNCCL: false,
|
||||
})
|
||||
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||
}
|
||||
if opts.RunNCCL {
|
||||
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := strings.Join([]string{
|
||||
"loader=bee-gpu-burn",
|
||||
"[gpu 0] device=NVIDIA H100",
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] int8_tensor_iterations=80",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
}, "\n")
|
||||
|
||||
got := parseBenchmarkBurnLog(raw)
|
||||
if got.Backend != "cublasLt" {
|
||||
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||
}
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 3 {
|
||||
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[0].Category != "fp16_bf16" {
|
||||
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
if got.Profiles[2].Category != "int8" {
|
||||
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||
}
|
||||
if got.Profiles[2].Weight != 0.25 {
|
||||
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
result := NvidiaBenchmarkResult{
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "PARTIAL",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "partial",
|
||||
},
|
||||
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||
GPUs: []BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100",
|
||||
Status: "OK",
|
||||
Steady: BenchmarkTelemetrySummary{
|
||||
AvgPowerW: 680,
|
||||
AvgTempC: 79,
|
||||
AvgGraphicsClockMHz: 1725,
|
||||
P95PowerW: 700,
|
||||
P95TempC: 82,
|
||||
P95GraphicsClockMHz: 1800,
|
||||
},
|
||||
Scores: BenchmarkScorecard{
|
||||
ComputeScore: 1200,
|
||||
PowerSustainScore: 96,
|
||||
ThermalSustainScore: 88,
|
||||
StabilityScore: 92,
|
||||
CompositeScore: 1176,
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||
},
|
||||
Throttle: BenchmarkThrottleCounters{
|
||||
SWPowerCapUS: 1000000,
|
||||
},
|
||||
DegradationReasons: []string{"power_capped"},
|
||||
},
|
||||
},
|
||||
Cooling: &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: 9200,
|
||||
FanDutyCycleAvailable: true,
|
||||
AvgFanDutyCyclePct: 47.5,
|
||||
P95FanDutyCyclePct: 62.0,
|
||||
},
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
for _, needle := range []string{
|
||||
"Executive Summary",
|
||||
"GPU 0 spent measurable time under SW power cap.",
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
"Cooling",
|
||||
"Average fan duty cycle",
|
||||
"47.5%",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "OK",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"gpu-metrics.csv",
|
||||
"gpu-metrics.html",
|
||||
"gpu-burn.log",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||
},
|
||||
})
|
||||
|
||||
if score.SyntheticScore != 100 {
|
||||
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||
}
|
||||
if score.MixedScore != 50 {
|
||||
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
SM : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
Video : 2107 MHz
|
||||
|
||||
GPU 00000000:4F:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
`)
|
||||
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
}
|
||||
455
audit/internal/platform/benchmark_types.go
Normal file
455
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,455 @@
|
||||
package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig holds static CPU and memory configuration captured at
|
||||
// benchmark start. Useful for correlating results across runs on different hardware.
|
||||
type BenchmarkHostConfig struct {
|
||||
CPUModel string `json:"cpu_model,omitempty"`
|
||||
CPUSockets int `json:"cpu_sockets,omitempty"`
|
||||
CPUCores int `json:"cpu_cores,omitempty"`
|
||||
CPUThreads int `json:"cpu_threads,omitempty"`
|
||||
MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
|
||||
// steady-state phase. High or unstable CPU load during a GPU benchmark may
|
||||
// indicate a competing workload or a CPU-bound driver bottleneck.
|
||||
type BenchmarkCPULoad struct {
|
||||
AvgPct float64 `json:"avg_pct"`
|
||||
MaxPct float64 `json:"max_pct"`
|
||||
P95Pct float64 `json:"p95_pct"`
|
||||
Samples int `json:"samples"`
|
||||
// Status is "ok", "high", or "unstable".
|
||||
Status string `json:"status"`
|
||||
Note string `json:"note,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
|
||||
// benchmark run.
|
||||
type BenchmarkCoolingSummary struct {
|
||||
Available bool `json:"available"`
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaBenchmarkProfileStandard = "standard"
|
||||
NvidiaBenchmarkProfileStability = "stability"
|
||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||
)
|
||||
|
||||
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||
// re-measure from actual task logs and update the constants here.
|
||||
//
|
||||
// Sources:
|
||||
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||
const (
|
||||
// Performance Benchmark (bee-gpu-burn).
|
||||
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||
// Sequential per-GPU mode scales approximately linearly.
|
||||
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||
|
||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||
)
|
||||
|
||||
type NvidiaBenchmarkOptions struct {
|
||||
Profile string
|
||||
SizeMB int
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
RunNCCL bool
|
||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||
RampTotal int // total number of ramp-up steps in this run
|
||||
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalizationGPU struct {
|
||||
Index int `json:"index"`
|
||||
PersistenceMode string `json:"persistence_mode,omitempty"`
|
||||
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
|
||||
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
|
||||
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
|
||||
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkGPUResult struct {
|
||||
Index int `json:"index"`
|
||||
UUID string `json:"uuid,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
VBIOS string `json:"vbios,omitempty"`
|
||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||
// Fallback: 80°C.
|
||||
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkTelemetrySummary struct {
|
||||
DurationSec float64 `json:"duration_sec"`
|
||||
Samples int `json:"samples"`
|
||||
AvgTempC float64 `json:"avg_temp_c"`
|
||||
P95TempC float64 `json:"p95_temp_c"`
|
||||
AvgPowerW float64 `json:"avg_power_w"`
|
||||
P95PowerW float64 `json:"p95_power_w"`
|
||||
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
|
||||
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
|
||||
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
|
||||
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
|
||||
AvgUsagePct float64 `json:"avg_usage_pct"`
|
||||
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
|
||||
ClockCVPct float64 `json:"clock_cv_pct"`
|
||||
PowerCVPct float64 `json:"power_cv_pct"`
|
||||
TempCVPct float64 `json:"temp_cv_pct"`
|
||||
ClockDriftPct float64 `json:"clock_drift_pct"`
|
||||
}
|
||||
|
||||
type BenchmarkThrottleCounters struct {
|
||||
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
|
||||
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
|
||||
SyncBoostUS uint64 `json:"sync_boost_us"`
|
||||
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
|
||||
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||
}
|
||||
|
||||
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
|
||||
// Corrected = single-bit errors fixed by ECC (DRAM degradation).
|
||||
// Uncorrected = double-bit errors that could not be corrected (serious fault).
|
||||
// Both are volatile (since last driver reset), not persistent.
|
||||
type BenchmarkECCCounters struct {
|
||||
Corrected uint64 `json:"corrected"`
|
||||
Uncorrected uint64 `json:"uncorrected"`
|
||||
}
|
||||
|
||||
func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
|
||||
func (e BenchmarkECCCounters) IsZero() bool { return e.Corrected == 0 && e.Uncorrected == 0 }
|
||||
|
||||
type BenchmarkPrecisionResult struct {
|
||||
Name string `json:"name"`
|
||||
Category string `json:"category"`
|
||||
Supported bool `json:"supported"`
|
||||
Lanes int `json:"lanes,omitempty"`
|
||||
M uint64 `json:"m,omitempty"`
|
||||
N uint64 `json:"n,omitempty"`
|
||||
K uint64 `json:"k,omitempty"`
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkScorecard struct {
|
||||
ComputeScore float64 `json:"compute_score"`
|
||||
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||
// steady phases (each precision ran alone, full GPU dedicated).
|
||||
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||
// (all precisions competing simultaneously — closer to real workloads).
|
||||
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||
// sustains throughput under concurrent mixed-precision load.
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
|
||||
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||
|
||||
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||
// that throttles an otherwise fast GPU.
|
||||
ServerQualityScore float64 `json:"server_quality_score"`
|
||||
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||
// so benchmark and audit data can be correlated by slot.
|
||||
type BenchmarkPSUSlotPower struct {
|
||||
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||
Status string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power from multiple independent
|
||||
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||
// covers only a subset of installed PSUs (partial coverage).
|
||||
//
|
||||
// Source legend:
|
||||
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
|
||||
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||
|
||||
// PSU DC output sum — power delivered to server internals after conversion.
|
||||
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||
|
||||
// Per-slot PSU readings at idle and at peak load.
|
||||
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||
|
||||
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||
|
||||
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
// during a dedicated single-precision steady window. Because only one kernel
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Status string `json:"status,omitempty"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
Status string `json:"status"`
|
||||
Attempted bool `json:"attempted"`
|
||||
Supported bool `json:"supported"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
|
||||
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
|
||||
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
|
||||
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||
// this server under full GPU load. Use for rack power planning.
|
||||
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||
// actual wall-power draw as seen by the server's power supply.
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||
// unavailable or no PSU faults occurred during the test.
|
||||
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchGPU struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||
// stably with all other GPUs running simultaneously at their own limits.
|
||||
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||
// additional derating.
|
||||
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
// Fan state sampled at the end of single-card calibration.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||
NewGPUIndex int `json:"new_gpu_index"`
|
||||
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||
Derated bool `json:"derated,omitempty"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
// ServerLoadedW is the IPMI server power reading captured during this
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// PSU slot readings sampled at end of this ramp step.
|
||||
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||
// Fan state at end of this ramp step.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
// scalability ramp-up phase of the performance benchmark.
|
||||
type NvidiaPerformanceRampStep struct {
|
||||
StepIndex int `json:"step_index"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||
ScalabilityPct float64 `json:"scalability_pct"`
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
@@ -9,8 +9,50 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
|
||||
func removableTargetReadOnly(fields map[string]string) bool {
|
||||
if fields["RO"] == "1" {
|
||||
return true
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
|
||||
case "iso9660", "squashfs":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ensureWritableMountpoint(mountpoint string) error {
|
||||
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||
}
|
||||
name := probe.Name()
|
||||
if closeErr := probe.Close(); closeErr != nil {
|
||||
_ = os.Remove(name)
|
||||
return closeErr
|
||||
}
|
||||
if err := os.Remove(name); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -34,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !removable || fields["FSTYPE"] == "" {
|
||||
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -52,7 +94,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||
if src == "" || target.Device == "" {
|
||||
return "", fmt.Errorf("source and target are required")
|
||||
}
|
||||
@@ -62,20 +104,43 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
|
||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||
mountedHere := false
|
||||
mounted := mountpoint != ""
|
||||
if mountpoint == "" {
|
||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
return "", formatMountTargetError(target, string(raw), err)
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
}
|
||||
defer func() {
|
||||
if !mounted {
|
||||
return
|
||||
}
|
||||
_ = exportExecCommand("sync").Run()
|
||||
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
retErr = err
|
||||
} else {
|
||||
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
}
|
||||
if mountedHere {
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst := filepath.Join(mountpoint, filename)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -83,12 +148,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = exec.Command("sync").Run()
|
||||
|
||||
if mountedHere {
|
||||
_ = exec.Command("umount", mountpoint).Run()
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
112
audit/internal/platform/export_test.go
Normal file
112
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
|
||||
var calls [][]string
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
calls = append(calls, append([]string{name}, args...))
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||
}
|
||||
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||
t.Fatalf("dst=%q want %q", got, want)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||
t.Fatalf("exported file missing: %v", err)
|
||||
}
|
||||
|
||||
foundUmount := false
|
||||
for _, call := range calls {
|
||||
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||
foundUmount = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundUmount {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||
t.Fatalf("chmod mountpoint: %v", err)
|
||||
}
|
||||
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-writable mountpoint")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||
t.Fatalf("err=%q want writable message", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||
oldExec := exportExecCommand
|
||||
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||
return cmd
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
targets, err := s.ListRemovableTargets()
|
||||
if err != nil {
|
||||
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||
}
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||
}
|
||||
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||
}
|
||||
}
|
||||
554
audit/internal/platform/gpu_metrics.go
Normal file
554
audit/internal/platform/gpu_metrics.go
Normal file
@@ -0,0 +1,554 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
Stage string `json:"stage,omitempty"`
|
||||
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 7 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
rows = append(rows, GPUMetricRow{
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
MemClockMHz: parseGPUFloat(parts[6]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
func parseGPUFloat(s string) float64 {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "N/A" || s == "[Not Supported]" || s == "" {
|
||||
return 0
|
||||
}
|
||||
v, _ := strconv.ParseFloat(s, 64)
|
||||
return v
|
||||
}
|
||||
|
||||
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||
if len(lines) < 2 {
|
||||
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||
}
|
||||
|
||||
// Parse header to find column indices by name.
|
||||
headers := strings.Split(lines[0], ",")
|
||||
colIdx := func(keywords ...string) int {
|
||||
for i, h := range headers {
|
||||
hl := strings.ToLower(strings.TrimSpace(h))
|
||||
for _, kw := range keywords {
|
||||
if strings.Contains(hl, kw) {
|
||||
return i
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||
idxUse := colIdx("gpu use (%)")
|
||||
idxMem := colIdx("vram%", "memory allocated")
|
||||
idxPow := colIdx("average graphics package power", "power (w)")
|
||||
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range lines[1:] {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
idx := len(rows)
|
||||
row := GPUMetricRow{GPUIndex: idx}
|
||||
get := func(i int) float64 {
|
||||
if i < 0 || i >= len(parts) {
|
||||
return 0
|
||||
}
|
||||
v := strings.TrimSpace(parts[i])
|
||||
if strings.EqualFold(v, "n/a") {
|
||||
return 0
|
||||
}
|
||||
return parseGPUFloat(v)
|
||||
}
|
||||
row.TempC = get(idxTemp)
|
||||
row.UsagePct = get(idxUse)
|
||||
row.MemUsagePct = get(idxMem)
|
||||
row.PowerW = get(idxPow)
|
||||
rows = append(rows, row)
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||
for _, r := range rows {
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
dutyEstimated := 0
|
||||
if r.FanDutyCycleEstimated {
|
||||
dutyEstimated = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
type gpuMetricStageSpan struct {
|
||||
Name string
|
||||
Start float64
|
||||
End float64
|
||||
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
stageSpans := buildGPUMetricStageSpans(rows)
|
||||
stageColorByName := make(map[string]string, len(stageSpans))
|
||||
for i, span := range stageSpans {
|
||||
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||
}
|
||||
|
||||
var legend strings.Builder
|
||||
if len(stageSpans) > 0 {
|
||||
legend.WriteString(`<div class="stage-legend">`)
|
||||
for _, span := range stageSpans {
|
||||
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||
}
|
||||
legend.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
ts := time.Now().UTC().Format("2006-01-02 15:04:05 UTC")
|
||||
html := fmt.Sprintf(`<!DOCTYPE html>
|
||||
<html><head>
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||
*{box-sizing:border-box}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||
.page{padding:24px}
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||
.card-body{padding:16px}
|
||||
h1{font-size:22px;margin:0 0 6px}
|
||||
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||
.chart-block{margin-top:16px}
|
||||
</style>
|
||||
</head><body>
|
||||
<div class="page">
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress Test Metrics</div>
|
||||
<div class="card-body">
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
<div class="chart-block">%s</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>`, ts, legend.String(), svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
const plotX2 = 840 // power axis / chart right border
|
||||
const plotY1 = 70 // top
|
||||
const plotY2 = 465 // bottom (PH = 395)
|
||||
const PW = plotX2 - plotX1
|
||||
const PH = plotY2 - plotY1
|
||||
// Outer axes
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const clockAxisX = 900 // clock axis line
|
||||
|
||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||
seriesLabel := [4]string{
|
||||
fmt.Sprintf("GPU %d Temp (°C)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Usage (%%)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Power (W)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Clock (MHz)", gpuIdx),
|
||||
}
|
||||
axisLabel := [4]string{"Temperature (°C)", "GPU Usage (%)", "Power (W)", "Clock (MHz)"}
|
||||
|
||||
// Extract series
|
||||
t := make([]float64, len(rows))
|
||||
vals := [4][]float64{}
|
||||
for i := range vals {
|
||||
vals[i] = make([]float64, len(rows))
|
||||
}
|
||||
for i, r := range rows {
|
||||
t[i] = r.ElapsedSec
|
||||
vals[0][i] = r.TempC
|
||||
vals[1][i] = r.UsagePct
|
||||
vals[2][i] = r.PowerW
|
||||
vals[3][i] = r.ClockMHz
|
||||
}
|
||||
|
||||
tMin, tMax := gpuMinMax(t)
|
||||
type axisScale struct {
|
||||
ticks []float64
|
||||
min, max float64
|
||||
}
|
||||
var axes [4]axisScale
|
||||
for i := 0; i < 4; i++ {
|
||||
mn, mx := gpuMinMax(vals[i])
|
||||
tks := gpuNiceTicks(mn, mx, 8)
|
||||
axes[i] = axisScale{ticks: tks, min: tks[0], max: tks[len(tks)-1]}
|
||||
}
|
||||
|
||||
xv := func(tv float64) float64 {
|
||||
if tMax == tMin {
|
||||
return float64(plotX1)
|
||||
}
|
||||
return float64(plotX1) + (tv-tMin)/(tMax-tMin)*float64(PW)
|
||||
}
|
||||
yv := func(v float64, ai int) float64 {
|
||||
a := axes[ai]
|
||||
if a.max == a.min {
|
||||
return float64(plotY1 + PH/2)
|
||||
}
|
||||
return float64(plotY2) - (v-a.min)/(a.max-a.min)*float64(PH)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
fmt.Fprintf(&b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d"`+
|
||||
` style="background:#fff;border-radius:8px;display:block;margin:0 auto 24px;`+
|
||||
`box-shadow:0 2px 12px rgba(0,0,0,.12)">`+"\n", W, H)
|
||||
|
||||
// Title
|
||||
fmt.Fprintf(&b, `<text x="%d" y="22" text-anchor="middle" font-family="sans-serif"`+
|
||||
` font-size="14" font-weight="bold" fill="#333">GPU Stress Test Metrics — GPU %d</text>`+"\n",
|
||||
plotX1+PW/2, gpuIdx)
|
||||
|
||||
// Horizontal grid (align to temp axis ticks)
|
||||
b.WriteString(`<g stroke="#e0e0e0" stroke-width="0.5">` + "\n")
|
||||
for _, tick := range axes[0].ticks {
|
||||
y := yv(tick, 0)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
plotX1, y, plotX2, y)
|
||||
}
|
||||
// Vertical grid
|
||||
xTicks := gpuNiceTicks(tMin, tMax, 10)
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, plotY1, x, plotY2)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Stage backgrounds
|
||||
for _, span := range stageSpans {
|
||||
x1 := xv(span.Start)
|
||||
x2 := xv(span.End)
|
||||
if x2 < x1 {
|
||||
x1, x2 = x2, x1
|
||||
}
|
||||
if x2-x1 < 1 {
|
||||
x2 = x1 + 1
|
||||
}
|
||||
color := stageColorByName[span.Name]
|
||||
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||
x1, plotY1, x2-x1, PH, color)
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||
}
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
plotX1, plotY1, PW, PH)
|
||||
|
||||
// X axis ticks and labels
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#333" text-anchor="middle">` + "\n")
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, plotY2+18, gpuFormatTick(tv))
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
x, plotY2, x, plotY2+4)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="13"`+
|
||||
` fill="#333" text-anchor="middle">Time (seconds)</text>`+"\n",
|
||||
plotX1+PW/2, plotY2+38)
|
||||
|
||||
// Y axes: [tempAxisX, plotX1, plotX2, clockAxisX]
|
||||
axisLineX := [4]int{tempAxisX, plotX1, plotX2, clockAxisX}
|
||||
axisRight := [4]bool{false, false, true, true}
|
||||
// Label x positions (for rotated vertical text)
|
||||
axisLabelX := [4]int{10, 68, 868, 950}
|
||||
|
||||
for i := 0; i < 4; i++ {
|
||||
ax := axisLineX[i]
|
||||
right := axisRight[i]
|
||||
color := colors[i]
|
||||
|
||||
// Axis line
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, plotY1, ax, plotY2, color)
|
||||
|
||||
// Ticks and tick labels
|
||||
fmt.Fprintf(&b, `<g font-family="sans-serif" font-size="10" fill="%s">`+"\n", color)
|
||||
for _, tick := range axes[i].ticks {
|
||||
y := yv(tick, i)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
dx := -5
|
||||
textX := ax - 8
|
||||
anchor := "end"
|
||||
if right {
|
||||
dx = 5
|
||||
textX = ax + 8
|
||||
anchor = "start"
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, y, ax+dx, y, color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="%s" dy="4">%s</text>`+"\n",
|
||||
textX, y, anchor, gpuFormatTick(tick))
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Axis label (rotated)
|
||||
lx := axisLabelX[i]
|
||||
fmt.Fprintf(&b, `<text transform="translate(%d,%d) rotate(-90)"`+
|
||||
` font-family="sans-serif" font-size="12" fill="%s" text-anchor="middle">%s</text>`+"\n",
|
||||
lx, plotY1+PH/2, color, axisLabel[i])
|
||||
}
|
||||
|
||||
// Data lines
|
||||
for i := 0; i < 4; i++ {
|
||||
var pts strings.Builder
|
||||
for j := range rows {
|
||||
x := xv(t[j])
|
||||
y := yv(vals[i][j], i)
|
||||
if j == 0 {
|
||||
fmt.Fprintf(&pts, "%.1f,%.1f", x, y)
|
||||
} else {
|
||||
fmt.Fprintf(&pts, " %.1f,%.1f", x, y)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="1.5"/>`+"\n",
|
||||
pts.String(), colors[i])
|
||||
}
|
||||
|
||||
// Legend
|
||||
const legendY = 42
|
||||
for i := 0; i < 4; i++ {
|
||||
lx := plotX1 + i*(PW/4) + 10
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="2"/>`+"\n",
|
||||
lx, legendY, lx+20, legendY, colors[i])
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="12" fill="#333">%s</text>`+"\n",
|
||||
lx+25, legendY+4, seriesLabel[i])
|
||||
}
|
||||
|
||||
b.WriteString("</svg>\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
mn, mx := vals[0], vals[0]
|
||||
for _, v := range vals[1:] {
|
||||
if v < mn {
|
||||
mn = v
|
||||
}
|
||||
if v > mx {
|
||||
mx = v
|
||||
}
|
||||
}
|
||||
return mn, mx
|
||||
}
|
||||
|
||||
func gpuNiceTicks(mn, mx float64, targetCount int) []float64 {
|
||||
if mn == mx {
|
||||
mn -= 1
|
||||
mx += 1
|
||||
}
|
||||
r := mx - mn
|
||||
step := math.Pow(10, math.Floor(math.Log10(r/float64(targetCount))))
|
||||
for _, f := range []float64{1, 2, 5, 10} {
|
||||
if r/(f*step) <= float64(targetCount)*1.5 {
|
||||
step = f * step
|
||||
break
|
||||
}
|
||||
}
|
||||
lo := math.Floor(mn/step) * step
|
||||
hi := math.Ceil(mx/step) * step
|
||||
var ticks []float64
|
||||
for v := lo; v <= hi+step*0.001; v += step {
|
||||
ticks = append(ticks, math.Round(v*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func gpuFormatTick(v float64) string {
|
||||
if v == math.Trunc(v) {
|
||||
return strconv.Itoa(int(v))
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
|
||||
var gpuMetricStagePalette = []string{
|
||||
"#d95c5c",
|
||||
"#2185d0",
|
||||
"#21ba45",
|
||||
"#f2c037",
|
||||
"#6435c9",
|
||||
"#00b5ad",
|
||||
"#a5673f",
|
||||
}
|
||||
|
||||
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
var spans []gpuMetricStageSpan
|
||||
for _, row := range rows {
|
||||
name := strings.TrimSpace(row.Stage)
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
start := row.StageStartSec
|
||||
end := row.StageEndSec
|
||||
if end <= start {
|
||||
start = row.ElapsedSec
|
||||
end = row.ElapsedSec
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||
continue
|
||||
}
|
||||
if start < spans[len(spans)-1].Start {
|
||||
spans[len(spans)-1].Start = start
|
||||
}
|
||||
if end > spans[len(spans)-1].End {
|
||||
spans[len(spans)-1].End = end
|
||||
}
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
spans[i].End = spans[i].Start + 1
|
||||
}
|
||||
}
|
||||
return spans
|
||||
}
|
||||
|
||||
var gpuHTMLReplacer = strings.NewReplacer(
|
||||
"&", "&",
|
||||
"<", "<",
|
||||
">", ">",
|
||||
`"`, """,
|
||||
"'", "'",
|
||||
)
|
||||
|
||||
func gpuHTMLEscape(s string) string {
|
||||
return gpuHTMLReplacer.Replace(s)
|
||||
}
|
||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||
}
|
||||
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage,elapsed_sec,gpu_index",
|
||||
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.html")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||
}
|
||||
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage-legend",
|
||||
"baseline",
|
||||
"steady-fp16",
|
||||
"GPU Stress Test Metrics",
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("html missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
269
audit/internal/platform/install.go
Normal file
269
audit/internal/platform/install.go
Normal file
@@ -0,0 +1,269 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
func (s *System) ListInstallDisks() ([]InstallDisk, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,MODEL,SIZE,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lsblk: %w", err)
|
||||
}
|
||||
|
||||
bootDev := findLiveBootDevice()
|
||||
|
||||
var disks []InstallDisk
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
// NAME MODEL SIZE TYPE TRAN — model may have spaces so we parse from end
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
// Last field: TRAN, second-to-last: TYPE, third-to-last: SIZE
|
||||
typ := fields[len(fields)-2]
|
||||
size := fields[len(fields)-3]
|
||||
name := fields[0]
|
||||
model := strings.Join(fields[1:len(fields)-3], " ")
|
||||
|
||||
if typ != "disk" {
|
||||
continue
|
||||
}
|
||||
|
||||
device := "/dev/" + name
|
||||
if device == bootDev {
|
||||
continue
|
||||
}
|
||||
|
||||
sizeBytes := diskSizeBytes(device)
|
||||
mounted := mountedParts(device)
|
||||
|
||||
disks = append(disks, InstallDisk{
|
||||
Device: device,
|
||||
Model: strings.TrimSpace(model),
|
||||
Size: size,
|
||||
SizeBytes: sizeBytes,
|
||||
MountedParts: mounted,
|
||||
})
|
||||
}
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
// diskSizeBytes returns the byte size of a block device using lsblk.
|
||||
func diskSizeBytes(device string) int64 {
|
||||
out, err := exec.Command("lsblk", "-bdn", "-o", "SIZE", device).Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
n, _ := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64)
|
||||
return n
|
||||
}
|
||||
|
||||
// mountedParts returns a list of "<part> at <mountpoint>" strings for any
|
||||
// mounted partitions on the given device.
|
||||
func mountedParts(device string) []string {
|
||||
out, err := exec.Command("lsblk", "-n", "-o", "NAME,MOUNTPOINT", device).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var result []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
mp := fields[1]
|
||||
if mp == "" || mp == "[SWAP]" {
|
||||
continue
|
||||
}
|
||||
result = append(result, "/dev/"+strings.TrimLeft(fields[0], "└─├─")+" at "+mp)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// findLiveBootDevice returns the block device backing /run/live/medium (if any).
|
||||
func findLiveBootDevice() string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
src := strings.TrimSpace(string(out))
|
||||
if src == "" {
|
||||
return ""
|
||||
}
|
||||
// Strip partition suffix to get the whole disk device.
|
||||
// e.g. /dev/sdb1 → /dev/sdb, /dev/nvme0n1p1 → /dev/nvme0n1
|
||||
out2, err := exec.Command("lsblk", "-no", "PKNAME", src).Output()
|
||||
if err != nil || strings.TrimSpace(string(out2)) == "" {
|
||||
return src
|
||||
}
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
func mountSource(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func mountFSType(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceType(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceTransport(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
switch {
|
||||
case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
|
||||
return "ram"
|
||||
case strings.EqualFold(strings.TrimSpace(deviceType), "rom"):
|
||||
return "cdrom"
|
||||
case strings.EqualFold(strings.TrimSpace(transport), "usb"):
|
||||
return "usb"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"):
|
||||
return "cdrom"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
|
||||
return "disk"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
func toramActive() bool {
|
||||
data, err := os.ReadFile("/proc/cmdline")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.Contains(string(data), "toram")
|
||||
}
|
||||
|
||||
// freeMemBytes returns MemAvailable from /proc/meminfo.
|
||||
func freeMemBytes() int64 {
|
||||
data, err := os.ReadFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
if strings.HasPrefix(line, "MemAvailable:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
n, _ := strconv.ParseInt(fields[1], 10, 64)
|
||||
return n * 1024 // kB → bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// DiskWarnings returns advisory warning strings for a disk candidate.
|
||||
func DiskWarnings(d InstallDisk) []string {
|
||||
var w []string
|
||||
if len(d.MountedParts) > 0 {
|
||||
w = append(w, "has mounted partitions: "+strings.Join(d.MountedParts, ", "))
|
||||
}
|
||||
min := MinInstallBytes()
|
||||
if min > 0 && d.SizeBytes > 0 && d.SizeBytes < min {
|
||||
w = append(w, fmt.Sprintf("disk may be too small (need ≥ %s, have %s)",
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
func humanBytes(b int64) string {
|
||||
const unit = 1024
|
||||
if b < unit {
|
||||
return fmt.Sprintf("%d B", b)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := b / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
// InstallToDisk runs bee-install <device> <logfile> and streams output to logFile.
|
||||
// The context can be used to cancel.
|
||||
func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
cmd := exec.CommandContext(ctx, "bee-install", device, logFile)
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// InstallLogPath returns the default install log path for a given device.
|
||||
func InstallLogPath(device string) string {
|
||||
safe := strings.NewReplacer("/", "_", " ", "_").Replace(device)
|
||||
return "/tmp/bee-install" + safe + ".log"
|
||||
}
|
||||
|
||||
// Label returns a display label for a disk.
|
||||
func (d InstallDisk) Label() string {
|
||||
model := d.Model
|
||||
if model == "" {
|
||||
model = "Unknown"
|
||||
}
|
||||
return fmt.Sprintf("%s %s %s", d.Device, d.Size, model)
|
||||
}
|
||||
422
audit/internal/platform/install_to_ram.go
Normal file
422
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,422 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const installToRAMDir = "/dev/shm/bee-live"
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||
return evaluateLiveMediaRAMState(
|
||||
s.LiveBootSource(),
|
||||
toramActive(),
|
||||
globPaths("/run/live/medium/live/*.squashfs"),
|
||||
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||
)
|
||||
}
|
||||
|
||||
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||
state := LiveMediaRAMState{
|
||||
LiveBootSource: status,
|
||||
ToramActive: toram,
|
||||
CopyPresent: len(copiedSquashfs) > 0,
|
||||
}
|
||||
if status.InRAM {
|
||||
state.State = "in_ram"
|
||||
state.Status = "ok"
|
||||
state.CopyComplete = true
|
||||
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||
return state
|
||||
}
|
||||
|
||||
expected := pathBaseSet(sourceSquashfs)
|
||||
copied := pathBaseSet(copiedSquashfs)
|
||||
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||
|
||||
switch {
|
||||
case state.CopyComplete:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||
case state.CopyPresent:
|
||||
state.State = "partial"
|
||||
state.Status = "partial"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||
case toram:
|
||||
state.State = "toram_failed"
|
||||
state.Status = "failed"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||
default:
|
||||
state.State = "not_in_ram"
|
||||
state.Status = "warning"
|
||||
state.CanStartCopy = true
|
||||
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||
}
|
||||
return state
|
||||
}
|
||||
|
||||
func globPaths(pattern string) []string {
|
||||
matches, _ := filepath.Glob(pattern)
|
||||
return matches
|
||||
}
|
||||
|
||||
func pathBaseSet(paths []string) map[string]struct{} {
|
||||
out := make(map[string]struct{}, len(paths))
|
||||
for _, path := range paths {
|
||||
base := strings.TrimSpace(filepath.Base(path))
|
||||
if base != "" {
|
||||
out[base] = struct{}{}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func setContainsAll(have, want map[string]struct{}) bool {
|
||||
if len(want) == 0 {
|
||||
return false
|
||||
}
|
||||
for name := range want {
|
||||
if _, ok := have[name]; !ok {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
state := s.LiveMediaRAMState()
|
||||
if state.InRAM {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
|
||||
dstDir := installToRAMDir
|
||||
|
||||
// If the source medium is unavailable, check whether a previous run already
|
||||
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||
// directly to the loop-rebind / bind-mount steps.
|
||||
if !sourceAvailable {
|
||||
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(copiedFiles) > 0 {
|
||||
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||
// Proceed to rebind with the already-copied files.
|
||||
for _, dst := range copiedFiles {
|
||||
base := filepath.Base(dst)
|
||||
// Re-associate the loop device that was originally backed by the
|
||||
// source file (now gone); find it by the old source path pattern.
|
||||
srcGuess := "/run/live/medium/live/" + base
|
||||
loopDev, lerr := findLoopForFile(srcGuess)
|
||||
if lerr != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||
continue
|
||||
}
|
||||
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
}
|
||||
|
||||
{
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
}
|
||||
|
||||
if state.CopyPresent {
|
||||
log("Removing stale partial RAM copy before retry...")
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if retErr == nil {
|
||||
return
|
||||
}
|
||||
_ = os.RemoveAll(dstDir)
|
||||
log("Removed incomplete RAM copy.")
|
||||
}()
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(sf)
|
||||
dst := filepath.Join(dstDir, base)
|
||||
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||
return fmt.Errorf("copy %s: %v", base, err)
|
||||
}
|
||||
log(fmt.Sprintf("Copied %s.", base))
|
||||
|
||||
loopDev, err := findLoopForFile(sf)
|
||||
if err != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||
continue
|
||||
}
|
||||
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
|
||||
bindMedium:
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
mediumRebound := false
|
||||
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||
} else {
|
||||
mediumRebound = true
|
||||
}
|
||||
|
||||
log("Verifying live medium now served from RAM...")
|
||||
status := s.LiveBootSource()
|
||||
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||
return err
|
||||
}
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
}
|
||||
|
||||
// The live medium mount was not redirected to RAM. This is expected when
|
||||
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||
// because the CD-ROM mount is in use. Check whether files were at least
|
||||
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||
// once the kernel has paged in all actively-used data.
|
||||
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(files) > 0 {
|
||||
if !mediumRebound {
|
||||
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||
}
|
||||
|
||||
func describeLiveBootSource(status LiveBootSource) string {
|
||||
source := strings.TrimSpace(status.Device)
|
||||
if source == "" {
|
||||
source = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if source == "" {
|
||||
source = "unknown source"
|
||||
}
|
||||
switch strings.TrimSpace(status.Kind) {
|
||||
case "ram":
|
||||
return "RAM"
|
||||
case "usb":
|
||||
return "USB (" + source + ")"
|
||||
case "cdrom":
|
||||
return "CD-ROM (" + source + ")"
|
||||
case "disk":
|
||||
return "disk (" + source + ")"
|
||||
default:
|
||||
return source
|
||||
}
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
fi, err := in.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
n, err := in.Read(buf)
|
||||
if n > 0 {
|
||||
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(src, path)
|
||||
target := filepath.Join(dst, rel)
|
||||
if fi.IsDir() {
|
||||
return os.MkdirAll(target, fi.Mode())
|
||||
}
|
||||
if strings.HasSuffix(path, ".squashfs") {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(target); err == nil {
|
||||
return nil
|
||||
}
|
||||
return copyFileLarge(ctx, path, target, nil)
|
||||
})
|
||||
}
|
||||
|
||||
func findLoopForFile(backingFile string) (string, error) {
|
||||
out, err := exec.Command("losetup", "--list", "--json").Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Name string `json:"name"`
|
||||
BackFile string `json:"back-file"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, dev := range result.Loopdevices {
|
||||
if dev.BackFile == backingFile {
|
||||
return dev.Name, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
// loopDeviceOffset returns the byte offset configured for the loop device,
|
||||
// or -1 if it cannot be determined.
|
||||
func loopDeviceOffset(loopDev string) int64 {
|
||||
out, err := exec.Command("losetup", "--json", loopDev).Output()
|
||||
if err != nil {
|
||||
return -1
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Offset int64 `json:"offset"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
|
||||
return -1
|
||||
}
|
||||
return result.Loopdevices[0].Offset
|
||||
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||
}
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
return loopChangeFD(loopDev, newFile)
|
||||
}
|
||||
33
audit/internal/platform/install_to_ram_linux.go
Normal file
33
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,33 @@
|
||||
//go:build linux
|
||||
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
const ioctlLoopChangeFD = 0x4C08
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer lf.Close()
|
||||
nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nf.Close()
|
||||
_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
|
||||
if errno != 0 {
|
||||
return errno
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
|
||||
func bindMount(src, dst string) error {
|
||||
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||
}
|
||||
13
audit/internal/platform/install_to_ram_other.go
Normal file
13
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,13 @@
|
||||
//go:build !linux
|
||||
|
||||
package platform
|
||||
|
||||
import "errors"
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
|
||||
func bindMount(src, dst string) error {
|
||||
return errors.New("bind mount not available on this platform")
|
||||
}
|
||||
103
audit/internal/platform/install_to_ram_test.go
Normal file
103
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dstDir := t.TempDir()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||
}
|
||||
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected verification failure when media is still on USB")
|
||||
}
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||
t.Fatalf("error=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||
t.Fatalf("got %q want RAM", got)
|
||||
}
|
||||
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
t.Run("in_ram", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||
false,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
false,
|
||||
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||
)
|
||||
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
if state.CopyComplete {
|
||||
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("toram_failed", func(t *testing.T) {
|
||||
state := evaluateLiveMediaRAMState(
|
||||
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||
true,
|
||||
nil,
|
||||
nil,
|
||||
)
|
||||
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||
t.Fatalf("state=%+v", state)
|
||||
}
|
||||
})
|
||||
}
|
||||
83
audit/internal/platform/kill_workers.go
Normal file
83
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
// bee test worker processes that should be killed by KillTestWorkers.
|
||||
var workerPatterns = []string{
|
||||
"bee-gpu-burn",
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
"dcgmi",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
type KilledProcess struct {
|
||||
PID int `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||
// table is very large (e.g. after a stress test with thousands of children).
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||
return killed
|
||||
default:
|
||||
}
|
||||
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(e.Name())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||
exe := strings.TrimSpace(args[0])
|
||||
base := exe
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
401
audit/internal/platform/live_metrics.go
Normal file
401
audit/internal/platform/live_metrics.go
Normal file
@@ -0,0 +1,401 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
PSUs []PSUReading `json:"psus,omitempty"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// PSUReading is a per-slot power supply input power reading.
|
||||
type PSUReading struct {
|
||||
Slot int `json:"slot"`
|
||||
Name string `json:"name"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
type TempReading struct {
|
||||
Name string `json:"name"`
|
||||
Group string `json:"group,omitempty"`
|
||||
Celsius float64 `json:"celsius"`
|
||||
}
|
||||
|
||||
// SampleLiveMetrics collects a single metrics snapshot from all available
|
||||
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
|
||||
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
|
||||
func SampleLiveMetrics() LiveMetricSample {
|
||||
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
||||
|
||||
// GPU metrics — try NVIDIA first, fall back to AMD
|
||||
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
||||
s.GPUs = gpus
|
||||
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
||||
s.GPUs = amdGPUs
|
||||
}
|
||||
|
||||
// Fan speeds — skipped silently if ipmitool unavailable
|
||||
fans, _ := sampleFanSpeeds()
|
||||
s.Fans = fans
|
||||
|
||||
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
||||
if !hasTempGroup(s.Temps, "cpu") {
|
||||
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
||||
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
|
||||
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||
s.PSUs = samplePSUPower()
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
|
||||
// Memory load — from /proc/meminfo
|
||||
s.MemLoadPct = sampleMemLoadPct()
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
func sampleCPULoadPct() float64 {
|
||||
total0, idle0 := readCPUStat()
|
||||
if total0 == 0 {
|
||||
return 0
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
total1, idle1 := readCPUStat()
|
||||
if total1 == 0 {
|
||||
return 0
|
||||
}
|
||||
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||
}
|
||||
|
||||
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
return 0
|
||||
}
|
||||
pct := (1 - di/dt) * 100
|
||||
if pct < 0 {
|
||||
return 0
|
||||
}
|
||||
if pct > 100 {
|
||||
return 100
|
||||
}
|
||||
return pct
|
||||
}
|
||||
|
||||
func readCPUStat() (total, idle uint64) {
|
||||
f, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
defer f.Close()
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if !strings.HasPrefix(line, "cpu ") {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)[1:] // skip "cpu"
|
||||
var vals [10]uint64
|
||||
for i := 0; i < len(fields) && i < 10; i++ {
|
||||
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
|
||||
}
|
||||
// idle = idle + iowait
|
||||
idle = vals[3] + vals[4]
|
||||
for _, v := range vals {
|
||||
total += v
|
||||
}
|
||||
return total, idle
|
||||
}
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
func sampleMemLoadPct() float64 {
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
defer f.Close()
|
||||
vals := map[string]uint64{}
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
fields := strings.Fields(sc.Text())
|
||||
if len(fields) >= 2 {
|
||||
v, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
vals[strings.TrimSuffix(fields[0], ":")] = v
|
||||
}
|
||||
}
|
||||
total := vals["MemTotal"]
|
||||
avail := vals["MemAvailable"]
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
used := total - avail
|
||||
return float64(used) / float64(total) * 100
|
||||
}
|
||||
|
||||
func hasTempGroup(temps []TempReading, group string) bool {
|
||||
for _, t := range temps {
|
||||
if t.Group == group {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func sampleLiveTemperatureReadings() []TempReading {
|
||||
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||
return temps
|
||||
}
|
||||
return sampleLiveTempsViaIPMI()
|
||||
}
|
||||
|
||||
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(out, &doc); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
chips := make([]string, 0, len(doc))
|
||||
for chip := range doc {
|
||||
chips = append(chips, chip)
|
||||
}
|
||||
sort.Strings(chips)
|
||||
|
||||
temps := make([]TempReading, 0, len(chips))
|
||||
seen := map[string]struct{}{}
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
featureNames := make([]string, 0, len(features))
|
||||
for name := range features {
|
||||
featureNames = append(featureNames, name)
|
||||
}
|
||||
sort.Strings(featureNames)
|
||||
for _, name := range featureNames {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
feature, ok := features[name].(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value, ok := firstTempInputValue(feature)
|
||||
if !ok || value <= 0 || value > 150 {
|
||||
continue
|
||||
}
|
||||
group := classifyLiveTempGroup(chip, name)
|
||||
if group == "gpu" {
|
||||
continue
|
||||
}
|
||||
label := strings.TrimSpace(name)
|
||||
if label == "" {
|
||||
continue
|
||||
}
|
||||
if group == "ambient" {
|
||||
label = compactAmbientTempName(chip, label)
|
||||
}
|
||||
key := group + "\x00" + label
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||
}
|
||||
}
|
||||
return temps
|
||||
}
|
||||
|
||||
func sampleLiveTempsViaIPMI() []TempReading {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
var temps []TempReading
|
||||
seen := map[string]struct{}{}
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
||||
if !strings.Contains(unit, "degrees") {
|
||||
continue
|
||||
}
|
||||
raw := strings.TrimSpace(parts[1])
|
||||
if raw == "" || strings.EqualFold(raw, "na") {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.ParseFloat(raw, 64)
|
||||
if err != nil || value <= 0 || value > 150 {
|
||||
continue
|
||||
}
|
||||
group := classifyLiveTempGroup("", name)
|
||||
if group == "gpu" {
|
||||
continue
|
||||
}
|
||||
label := name
|
||||
if group == "ambient" {
|
||||
label = compactAmbientTempName("", label)
|
||||
}
|
||||
key := group + "\x00" + label
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||
}
|
||||
return temps
|
||||
}
|
||||
|
||||
func firstTempInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
|
||||
continue
|
||||
}
|
||||
switch value := feature[key].(type) {
|
||||
case float64:
|
||||
return value, true
|
||||
case string:
|
||||
f, err := strconv.ParseFloat(value, 64)
|
||||
if err == nil {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func classifyLiveTempGroup(chip, name string) string {
|
||||
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
|
||||
switch {
|
||||
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
|
||||
return "gpu"
|
||||
case strings.Contains(text, "coretemp"),
|
||||
strings.Contains(text, "k10temp"),
|
||||
strings.Contains(text, "zenpower"),
|
||||
strings.Contains(text, "package id"),
|
||||
strings.Contains(text, "x86_pkg_temp"),
|
||||
strings.Contains(text, "tctl"),
|
||||
strings.Contains(text, "tdie"),
|
||||
strings.Contains(text, "tccd"),
|
||||
strings.Contains(text, "cpu"),
|
||||
strings.Contains(text, "peci"):
|
||||
return "cpu"
|
||||
default:
|
||||
return "ambient"
|
||||
}
|
||||
}
|
||||
|
||||
func compactAmbientTempName(chip, name string) string {
|
||||
chip = strings.TrimSpace(chip)
|
||||
name = strings.TrimSpace(name)
|
||||
if chip == "" || strings.EqualFold(chip, name) {
|
||||
return name
|
||||
}
|
||||
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
|
||||
return name
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
|
||||
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||
// It parses `ipmitool sdr elist full` output looking for Power Supply entity
|
||||
// sensors (entity ID "10.N") that report a value in Watts.
|
||||
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||
func samplePSUPower() []PSUReading {
|
||||
out, err := exec.Command("ipmitool", "sdr", "elist", "full").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
// map slot → reading (keep highest-watt value per slot in case of duplicates)
|
||||
type entry struct {
|
||||
name string
|
||||
powerW float64
|
||||
}
|
||||
bySlot := map[int]entry{}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
entityID := strings.TrimSpace(parts[3]) // e.g. "10.1"
|
||||
if !strings.HasPrefix(entityID, "10.") {
|
||||
continue // not a Power Supply entity
|
||||
}
|
||||
slotStr := strings.TrimPrefix(entityID, "10.")
|
||||
slot, err := strconv.Atoi(slotStr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
valueField := strings.TrimSpace(parts[4]) // e.g. "740.00 Watts"
|
||||
if !strings.Contains(strings.ToLower(valueField), "watts") {
|
||||
continue
|
||||
}
|
||||
valueFields := strings.Fields(valueField)
|
||||
if len(valueFields) < 2 {
|
||||
continue
|
||||
}
|
||||
w, err := strconv.ParseFloat(valueFields[0], 64)
|
||||
if err != nil || w <= 0 {
|
||||
continue
|
||||
}
|
||||
sensorName := strings.TrimSpace(parts[0])
|
||||
if existing, ok := bySlot[slot]; !ok || w > existing.powerW {
|
||||
bySlot[slot] = entry{name: sensorName, powerW: w}
|
||||
}
|
||||
}
|
||||
if len(bySlot) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := make([]int, 0, len(bySlot))
|
||||
for s := range bySlot {
|
||||
slots = append(slots, s)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
psus := make([]PSUReading, 0, len(slots))
|
||||
for _, s := range slots {
|
||||
e := bySlot[s]
|
||||
psus = append(psus, PSUReading{Slot: s, Name: e.name, PowerW: e.powerW})
|
||||
}
|
||||
return psus
|
||||
}
|
||||
94
audit/internal/platform/live_metrics_test.go
Normal file
94
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestFirstTempInputValue(t *testing.T) {
|
||||
feature := map[string]any{
|
||||
"temp1_input": 61.5,
|
||||
"temp1_max": 80.0,
|
||||
}
|
||||
got, ok := firstTempInputValue(feature)
|
||||
if !ok {
|
||||
t.Fatal("expected value")
|
||||
}
|
||||
if got != 61.5 {
|
||||
t.Fatalf("got %v want 61.5", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyLiveTempGroup(t *testing.T) {
|
||||
tests := []struct {
|
||||
chip string
|
||||
name string
|
||||
want string
|
||||
}{
|
||||
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
|
||||
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
|
||||
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
|
||||
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
|
||||
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCompactAmbientTempName(t *testing.T) {
|
||||
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPULoadPctBetween(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
prevTotal uint64
|
||||
prevIdle uint64
|
||||
total uint64
|
||||
idle uint64
|
||||
want float64
|
||||
}{
|
||||
{
|
||||
name: "busy half",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 90,
|
||||
want: 50,
|
||||
},
|
||||
{
|
||||
name: "fully busy",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 40,
|
||||
want: 100,
|
||||
},
|
||||
{
|
||||
name: "no progress",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 100,
|
||||
idle: 40,
|
||||
want: 0,
|
||||
},
|
||||
{
|
||||
name: "idle delta larger than total clamps to zero",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 150,
|
||||
want: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
||||
out := make([]InterfaceInfo, 0, len(names))
|
||||
for _, name := range names {
|
||||
state := "unknown"
|
||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
||||
fields := strings.Fields(string(raw))
|
||||
if len(fields) >= 9 {
|
||||
state = fields[8]
|
||||
if up, err := interfaceAdminState(name); err == nil {
|
||||
if up {
|
||||
state = "up"
|
||||
} else {
|
||||
state = "down"
|
||||
}
|
||||
}
|
||||
|
||||
var ipv4 []string
|
||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 4 {
|
||||
ipv4 = append(ipv4, fields[3])
|
||||
}
|
||||
}
|
||||
ipv4, err := interfaceIPv4Addrs(name)
|
||||
if err != nil {
|
||||
ipv4 = nil
|
||||
}
|
||||
|
||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
|
||||
names, err := listInterfaceNames()
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
|
||||
snapshot := NetworkSnapshot{
|
||||
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
|
||||
}
|
||||
for _, name := range names {
|
||||
up, err := interfaceAdminState(name)
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
ipv4, err := interfaceIPv4Addrs(name)
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
|
||||
Name: name,
|
||||
Up: up,
|
||||
IPv4: ipv4,
|
||||
})
|
||||
}
|
||||
|
||||
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
|
||||
snapshot.ResolvConf = string(raw)
|
||||
}
|
||||
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
|
||||
var errs []string
|
||||
|
||||
for _, iface := range snapshot.Interfaces {
|
||||
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
|
||||
continue
|
||||
}
|
||||
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
|
||||
}
|
||||
for _, cidr := range iface.IPv4 {
|
||||
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
|
||||
detail := strings.TrimSpace(string(raw))
|
||||
if detail != "" {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
|
||||
} else {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
state := "down"
|
||||
if iface.Up {
|
||||
state = "up"
|
||||
}
|
||||
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
|
||||
}
|
||||
}
|
||||
|
||||
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
if !errors.As(err, &exitErr) {
|
||||
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
|
||||
}
|
||||
}
|
||||
for _, route := range snapshot.DefaultRoutes {
|
||||
fields := strings.Fields(route)
|
||||
if len(fields) == 0 {
|
||||
continue
|
||||
}
|
||||
// Strip state flags that ip-route(8) does not accept as add arguments.
|
||||
filtered := fields[:0]
|
||||
for _, f := range fields {
|
||||
switch f {
|
||||
case "linkdown", "dead", "onlink", "pervasive":
|
||||
// skip
|
||||
default:
|
||||
filtered = append(filtered, f)
|
||||
}
|
||||
}
|
||||
args := append([]string{"route", "add"}, filtered...)
|
||||
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
|
||||
detail := strings.TrimSpace(string(raw))
|
||||
if detail != "" {
|
||||
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
|
||||
} else {
|
||||
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errors.New(strings.Join(errs, "; "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *System) DHCPOne(iface string) (string, error) {
|
||||
var out bytes.Buffer
|
||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
// SetInterfaceState brings a network interface up or down.
|
||||
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||
state := "down"
|
||||
if up {
|
||||
state = "up"
|
||||
}
|
||||
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||
}
|
||||
|
||||
// GetInterfaceState returns true if the interface is UP.
|
||||
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
return interfaceAdminState(iface)
|
||||
}
|
||||
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
|
||||
func parseInterfaceAdminState(raw string) (bool, error) {
|
||||
start := strings.IndexByte(raw, '<')
|
||||
if start == -1 {
|
||||
return false, fmt.Errorf("ip link output missing flags")
|
||||
}
|
||||
end := strings.IndexByte(raw[start+1:], '>')
|
||||
if end == -1 {
|
||||
return false, fmt.Errorf("ip link output missing flag terminator")
|
||||
}
|
||||
flags := strings.Split(raw[start+1:start+1+end], ",")
|
||||
for _, flag := range flags {
|
||||
if strings.TrimSpace(flag) == "UP" {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 4 {
|
||||
ipv4 = append(ipv4, fields[3])
|
||||
}
|
||||
}
|
||||
return ipv4, nil
|
||||
}
|
||||
|
||||
func listInterfaceNames() ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||
if err != nil {
|
||||
|
||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseInterfaceAdminState(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
raw string
|
||||
want bool
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "admin up with no carrier",
|
||||
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "admin down",
|
||||
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "malformed output",
|
||||
raw: "2: enp1s0: mtu 1500 state DOWN\n",
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, err := parseInterfaceAdminState(tt.raw)
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != tt.want {
|
||||
t.Fatalf("got %v want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
209
audit/internal/platform/nvidia_stress.go
Normal file
209
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,209 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
normalizeNvidiaStressOptions(&opts)
|
||||
|
||||
job, err := buildNvidiaStressJob(opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return satJob{}, err
|
||||
}
|
||||
|
||||
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||
switch loader {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-gpu-burn.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderJohn:
|
||||
cmd := []string{
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-john-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderNCCL:
|
||||
cmd := []string{
|
||||
"bee-nccl-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-nccl-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
default:
|
||||
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
case NvidiaStressLoaderJohn:
|
||||
opts.Loader = NvidiaStressLoaderJohn
|
||||
case NvidiaStressLoaderNCCL:
|
||||
opts.Loader = NvidiaStressLoaderNCCL
|
||||
default:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
}
|
||||
|
||||
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
|
||||
selected := all
|
||||
if len(include) > 0 {
|
||||
want := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
want[idx] = struct{}{}
|
||||
}
|
||||
selected = selected[:0]
|
||||
for _, idx := range all {
|
||||
if _, ok := want[idx]; ok {
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||
}
|
||||
out := append([]int(nil), selected...)
|
||||
sort.Ints(out)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
func dedupeSortedIndices(values []int) []int {
|
||||
if len(values) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := make(map[int]struct{}, len(values))
|
||||
out := make([]int, 0, len(values))
|
||||
for _, value := range values {
|
||||
if value < 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[value]; ok {
|
||||
continue
|
||||
}
|
||||
seen[value] = struct{}{}
|
||||
out = append(out, value)
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func joinIndexList(values []int) string {
|
||||
parts := make([]string, 0, len(values))
|
||||
for _, value := range values {
|
||||
parts = append(parts, strconv.Itoa(value))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
563
audit/internal/platform/platform_stress.go
Normal file
563
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,563 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlatformStressCycle defines one load+idle cycle.
|
||||
type PlatformStressCycle struct {
|
||||
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||
IdleSec int // seconds of idle monitoring after load cut
|
||||
}
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
type platformStressRow struct {
|
||||
ElapsedSec float64
|
||||
Cycle int
|
||||
Phase string // "load" | "idle"
|
||||
CPULoadPct float64
|
||||
MaxCPUTempC float64
|
||||
MaxGPUTempC float64
|
||||
SysPowerW float64
|
||||
FanMinRPM float64
|
||||
FanMaxRPM float64
|
||||
GPUThrottled bool
|
||||
}
|
||||
|
||||
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||
func (s *System) RunPlatformStress(
|
||||
ctx context.Context,
|
||||
baseDir string,
|
||||
opts PlatformStressOptions,
|
||||
logFunc func(string),
|
||||
) (string, error) {
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if len(opts.Cycles) == 0 {
|
||||
return "", fmt.Errorf("no cycles defined")
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
|
||||
stamp := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
|
||||
var analyses []cycleAnalysis
|
||||
|
||||
for i, cycle := range opts.Cycles {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cycleNum := i + 1
|
||||
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||
|
||||
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
if hasCPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
}
|
||||
|
||||
// GPU stress
|
||||
if hasGPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
for _, r := range loadRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, loadRows...)
|
||||
loadCancel()
|
||||
wg.Wait()
|
||||
|
||||
if len(loadRows) > 0 {
|
||||
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||
}
|
||||
|
||||
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||
for _, r := range idleRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, idleRows...)
|
||||
idleCancel()
|
||||
|
||||
// Per-cycle analysis
|
||||
an := analyzePlatformCycle(loadRows, idleRows)
|
||||
analyses = append(analyses, an)
|
||||
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||
}
|
||||
|
||||
// Write CSV
|
||||
csvData := writePlatformCSV(rows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||
|
||||
// Write summary
|
||||
summary := writePlatformSummary(opts, analyses)
|
||||
logFunc("--- Summary ---")
|
||||
for _, line := range strings.Split(summary, "\n") {
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||
var rows []platformStressRow
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return rows
|
||||
case <-ticker.C:
|
||||
sample := SampleLiveMetrics()
|
||||
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||
r := platformStressRow{
|
||||
ElapsedSec: time.Since(testStart).Seconds(),
|
||||
Cycle: cycle,
|
||||
Phase: phase,
|
||||
CPULoadPct: s.CPULoadPct,
|
||||
SysPowerW: s.PowerW,
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
if t.Celsius > r.MaxCPUTempC {
|
||||
r.MaxCPUTempC = t.Celsius
|
||||
}
|
||||
case "gpu":
|
||||
if t.Celsius > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = t.Celsius
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
if g.TempC > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = g.TempC
|
||||
}
|
||||
}
|
||||
if len(s.Fans) > 0 {
|
||||
r.FanMinRPM = s.Fans[0].RPM
|
||||
r.FanMaxRPM = s.Fans[0].RPM
|
||||
for _, f := range s.Fans[1:] {
|
||||
if f.RPM < r.FanMinRPM {
|
||||
r.FanMinRPM = f.RPM
|
||||
}
|
||||
if f.RPM > r.FanMaxRPM {
|
||||
r.FanMaxRPM = f.RPM
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func formatPlatformRow(r platformStressRow) string {
|
||||
throttle := ""
|
||||
if r.GPUThrottled {
|
||||
throttle = " THROTTLE"
|
||||
}
|
||||
fans := ""
|
||||
if r.FanMinRPM > 0 {
|
||||
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||
}
|
||||
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||
}
|
||||
|
||||
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||
var an cycleAnalysis
|
||||
for _, r := range loadRows {
|
||||
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||
an.maxCPUTemp = r.MaxCPUTempC
|
||||
}
|
||||
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||
an.maxGPUTemp = r.MaxGPUTempC
|
||||
}
|
||||
if r.SysPowerW > an.maxPower {
|
||||
an.maxPower = r.SysPowerW
|
||||
}
|
||||
if r.GPUThrottled {
|
||||
an.throttled = true
|
||||
}
|
||||
}
|
||||
// Fan RPM at cut = avg of last 5 load rows
|
||||
if n := len(loadRows); n > 0 {
|
||||
window := loadRows
|
||||
if n > 5 {
|
||||
window = loadRows[n-5:]
|
||||
}
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range window {
|
||||
if r.FanMinRPM > 0 {
|
||||
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt > 0 {
|
||||
an.fanAtCutAvg = sum / float64(cnt)
|
||||
}
|
||||
}
|
||||
// Fan RPM min in first 15s of idle
|
||||
an.fanMin15s = an.fanAtCutAvg
|
||||
var cutElapsed float64
|
||||
if len(loadRows) > 0 {
|
||||
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||
}
|
||||
for _, r := range idleRows {
|
||||
if r.ElapsedSec > cutElapsed+15 {
|
||||
break
|
||||
}
|
||||
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||
an.fanMin15s = avg
|
||||
}
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||
}
|
||||
return an
|
||||
}
|
||||
|
||||
type cycleAnalysis struct {
|
||||
maxCPUTemp float64
|
||||
maxGPUTemp float64
|
||||
maxPower float64
|
||||
throttled bool
|
||||
fanAtCutAvg float64
|
||||
fanMin15s float64
|
||||
fanDropPct float64
|
||||
}
|
||||
|
||||
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||
|
||||
totalThrottle := 0
|
||||
totalFanWarn := 0
|
||||
for i, an := range analyses {
|
||||
cycle := opts.Cycles[i]
|
||||
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||
if an.throttled {
|
||||
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||
totalThrottle++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Throttle: none\n")
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||
if an.fanDropPct > 20 {
|
||||
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||
totalFanWarn++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||
if totalThrottle > 0 {
|
||||
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||
} else if totalFanWarn > 0 {
|
||||
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||
} else {
|
||||
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||
var buf bytes.Buffer
|
||||
w := csv.NewWriter(&buf)
|
||||
_ = w.Write([]string{
|
||||
"elapsed_sec", "cycle", "phase",
|
||||
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||
})
|
||||
for _, r := range rows {
|
||||
throttled := "0"
|
||||
if r.GPUThrottled {
|
||||
throttled = "1"
|
||||
}
|
||||
_ = w.Write([]string{
|
||||
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||
strconv.Itoa(r.Cycle),
|
||||
r.Phase,
|
||||
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||
throttled,
|
||||
})
|
||||
}
|
||||
w.Flush()
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
path, err := satLookPath("stressapptest")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx, durSec)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := fmt.Sprintf(`actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: %d`, durSec*1000) + `
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||
// where the context is cancelled early (user stop, parent timeout).
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
func containsComponent(components []string, name string) bool {
|
||||
for _, c := range components {
|
||||
if c == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
gz := gzip.NewWriter(f)
|
||||
defer gz.Close()
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(dir)
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
fpath := filepath.Join(dir, e.Name())
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
hdr := &tar.Header{
|
||||
Name: filepath.Join(base, e.Name()),
|
||||
Size: int64(len(data)),
|
||||
Mode: 0644,
|
||||
ModTime: time.Now(),
|
||||
}
|
||||
if err := tw.WriteHeader(hdr); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tw.Write(data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
344
audit/internal/platform/runtime.go
Normal file
344
audit/internal/platform/runtime.go
Normal file
@@ -0,0 +1,344 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var runtimeRequiredTools = []string{
|
||||
"dmidecode",
|
||||
"lspci",
|
||||
"lsblk",
|
||||
"smartctl",
|
||||
"nvme",
|
||||
"ipmitool",
|
||||
"dhclient",
|
||||
"mount",
|
||||
}
|
||||
|
||||
var runtimeTrackedServices = []string{
|
||||
"bee-network",
|
||||
"bee-nvidia",
|
||||
"bee-preflight",
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
"nvidia-dcgm",
|
||||
"nvidia-fabricmanager",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
checkedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
health := schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
CheckedAt: checkedAt,
|
||||
ExportDir: strings.TrimSpace(exportDir),
|
||||
}
|
||||
|
||||
if health.ExportDir != "" {
|
||||
if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
|
||||
health.Status = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "export_dir_unavailable",
|
||||
Severity: "critical",
|
||||
Description: err.Error(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
interfaces, err := s.ListInterfaces()
|
||||
if err == nil {
|
||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||
hasIPv4 := false
|
||||
missingIPv4 := false
|
||||
for _, iface := range interfaces {
|
||||
outcome := "no_offer"
|
||||
if len(iface.IPv4) > 0 {
|
||||
outcome = "lease_acquired"
|
||||
hasIPv4 = true
|
||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||
outcome = "link_down"
|
||||
} else {
|
||||
missingIPv4 = true
|
||||
}
|
||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||
Name: iface.Name,
|
||||
State: iface.State,
|
||||
IPv4: iface.IPv4,
|
||||
Outcome: outcome,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case hasIPv4 && !missingIPv4:
|
||||
health.NetworkStatus = "OK"
|
||||
case hasIPv4:
|
||||
health.NetworkStatus = "PARTIAL"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_partial",
|
||||
Severity: "warning",
|
||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
||||
})
|
||||
default:
|
||||
health.NetworkStatus = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_failed",
|
||||
Severity: "warning",
|
||||
Description: "No physical interface obtained IPv4 connectivity.",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||
Name: tool.Name,
|
||||
Path: tool.Path,
|
||||
OK: tool.OK,
|
||||
})
|
||||
if !tool.OK {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "tool_missing",
|
||||
Severity: "warning",
|
||||
Description: "Required tool missing: " + tool.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for _, name := range runtimeTrackedServices {
|
||||
health.Services = append(health.Services, schema.RuntimeServiceStatus{
|
||||
Name: name,
|
||||
Status: s.ServiceState(name),
|
||||
})
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
s.collectToRAMHealth(&health)
|
||||
s.collectUSBExportHealth(&health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
}
|
||||
return health, nil
|
||||
}
|
||||
|
||||
func commandText(name string, args ...string) string {
|
||||
raw, err := exec.Command(name, args...).CombinedOutput()
|
||||
if err != nil && len(raw) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools := s.CheckTools(runtimeRequiredTools)
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"dcgmi",
|
||||
"nv-hostengine",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
tool.Path = cmd[0]
|
||||
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||
tool.Path = cmd[1]
|
||||
}
|
||||
tool.OK = true
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
return tools
|
||||
}
|
||||
|
||||
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
for _, candidate := range candidates {
|
||||
path, err := exec.LookPath(candidate)
|
||||
if err == nil {
|
||||
return ToolStatus{Name: display, Path: path, OK: true}
|
||||
}
|
||||
}
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||
// "failed" = toram was requested but medium is not in RAM.
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
state := s.LiveMediaRAMState()
|
||||
health.ToRAMStatus = state.Status
|
||||
switch state.Status {
|
||||
case "ok":
|
||||
return
|
||||
case "failed":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
case "partial":
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_partial",
|
||||
Severity: "warning",
|
||||
Description: state.Message,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||
// suitable for log export. Sets USBExportPath to the first match found.
|
||||
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||
health.USBExportPath = findUSBExportMount()
|
||||
}
|
||||
|
||||
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||
// has USB transport. Returns "" if none found.
|
||||
func findUSBExportMount() string {
|
||||
f, err := os.Open("/proc/mounts")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
// fields: device mountpoint fstype options dump pass
|
||||
fields := strings.Fields(scanner.Text())
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||
continue
|
||||
}
|
||||
// Skip read-only mounts
|
||||
opts := strings.Split(options, ",")
|
||||
readOnly := false
|
||||
for _, o := range opts {
|
||||
if strings.TrimSpace(o) == "ro" {
|
||||
readOnly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if readOnly {
|
||||
continue
|
||||
}
|
||||
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||
if !strings.HasPrefix(device, "/dev/") {
|
||||
continue
|
||||
}
|
||||
checkDev := device
|
||||
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||
// Strip trailing partition digits to get the parent disk name.
|
||||
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||
checkDev = trimmed
|
||||
}
|
||||
if blockDeviceTransport(checkDev) == "usb" {
|
||||
return mountPoint
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_stuck",
|
||||
Severity: "critical",
|
||||
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||
})
|
||||
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_disabled",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||
})
|
||||
}
|
||||
}
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
case "amd":
|
||||
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "amdgpu_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "AMD GPU driver is not loaded.",
|
||||
})
|
||||
}
|
||||
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.CUDAReady = true
|
||||
health.DriverReady = true
|
||||
return
|
||||
}
|
||||
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "rocm_smi_unavailable",
|
||||
Severity: "warning",
|
||||
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
965
audit/internal/platform/sat_fan_stress.go
Normal file
965
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,965 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FanStressOptions configures the fan-stress / thermal cycling test.
|
||||
type FanStressOptions struct {
|
||||
BaselineSec int // idle monitoring before and after load (default 30)
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
// FanReading holds one fan sensor reading.
|
||||
type FanReading struct {
|
||||
Name string
|
||||
RPM float64
|
||||
}
|
||||
|
||||
// GPUStressMetric holds per-GPU metrics during the stress test.
|
||||
type GPUStressMetric struct {
|
||||
Index int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
Throttled bool // true if any throttle reason is active
|
||||
}
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
type cachedPowerReading struct {
|
||||
Value float64
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type fanObservationState struct {
|
||||
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||
}
|
||||
|
||||
type fanPeakCandidate struct {
|
||||
FirstSeen time.Time
|
||||
RPM float64
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
fanObservationMu sync.Mutex
|
||||
fanObservation fanObservationState
|
||||
fanObservationInit bool
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||
|
||||
const fanObservationMinPeakHold = time.Second
|
||||
|
||||
func normalizeObservedFanMaxRPM(rpm float64) float64 {
|
||||
if rpm <= 0 {
|
||||
return 0
|
||||
}
|
||||
return math.Ceil(rpm/1000.0) * 1000.0
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
applyFanStressDefaults(&opts)
|
||||
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
// Phase name shared between sampler goroutine and main goroutine.
|
||||
var phaseMu sync.Mutex
|
||||
currentPhase := "init"
|
||||
setPhase := func(name string) {
|
||||
phaseMu.Lock()
|
||||
currentPhase = name
|
||||
phaseMu.Unlock()
|
||||
}
|
||||
getPhase := func() string {
|
||||
phaseMu.Lock()
|
||||
defer phaseMu.Unlock()
|
||||
return currentPhase
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
var rowsMu sync.Mutex
|
||||
var allRows []FanStressRow
|
||||
|
||||
// Start background sampler (every second).
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||
rowsMu.Lock()
|
||||
allRows = append(allRows, row)
|
||||
rowsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var summary strings.Builder
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
|
||||
stats := satStats{}
|
||||
|
||||
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durSec) * time.Second):
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||
)
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
stats.Failed++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
}
|
||||
|
||||
// Execute test phases.
|
||||
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||
|
||||
// Stop sampler and collect rows.
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
rowsMu.Lock()
|
||||
rows := allRows
|
||||
rowsMu.Unlock()
|
||||
|
||||
// Analysis.
|
||||
throttled := analyzeThrottling(rows)
|
||||
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
var m float64
|
||||
for _, g := range r.GPUs {
|
||||
if g.TempC > m {
|
||||
m = g.TempC
|
||||
}
|
||||
}
|
||||
return m
|
||||
})
|
||||
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
return r.CPUMaxTempC
|
||||
})
|
||||
fanResponseSec := analyzeFanResponse(rows)
|
||||
|
||||
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||
if fanResponseSec >= 0 {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||
}
|
||||
|
||||
// Throttling failure counts against overall result.
|
||||
if throttled {
|
||||
stats.Failed++
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
|
||||
// Write CSV outputs.
|
||||
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.BaselineSec <= 0 {
|
||||
opts.BaselineSec = 30
|
||||
}
|
||||
if opts.Phase1DurSec <= 0 {
|
||||
opts.Phase1DurSec = 300
|
||||
}
|
||||
if opts.PauseSec <= 0 {
|
||||
opts.PauseSec = 60
|
||||
}
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||
row := FanStressRow{
|
||||
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||
ElapsedSec: elapsed,
|
||||
Phase: phase,
|
||||
}
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
return row
|
||||
}
|
||||
|
||||
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||
// clock frequency, and active throttle reasons for each GPU.
|
||||
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var metrics []GPUStressMetric
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
throttleVal := strings.TrimSpace(parts[5])
|
||||
// Throttled if active reasons bitmask is non-zero.
|
||||
throttled := throttleVal != "0x0000000000000000" &&
|
||||
throttleVal != "0x0" &&
|
||||
throttleVal != "0" &&
|
||||
throttleVal != "" &&
|
||||
throttleVal != "N/A"
|
||||
metrics = append(metrics, GPUStressMetric{
|
||||
Index: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
Throttled: throttled,
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||
func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
updateFanObservation(fans, time.Now())
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
func loadFanObservationLocked() {
|
||||
if fanObservationInit {
|
||||
return
|
||||
}
|
||||
fanObservationInit = true
|
||||
fanObservation.MaxRPM = make(map[string]float64)
|
||||
raw, err := os.ReadFile(fanObservationStatePath)
|
||||
if err != nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted fanObservationState
|
||||
if json.Unmarshal(raw, &persisted) != nil {
|
||||
return
|
||||
}
|
||||
for name, rpm := range persisted.MaxRPM {
|
||||
name = strings.TrimSpace(name)
|
||||
if name == "" || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
fanObservation.MaxRPM[name] = rpm
|
||||
}
|
||||
}
|
||||
|
||||
func saveFanObservationLocked() {
|
||||
if len(fanObservation.MaxRPM) == 0 {
|
||||
return
|
||||
}
|
||||
dir := filepath.Dir(fanObservationStatePath)
|
||||
if dir == "" || dir == "." {
|
||||
dir = "/var/log/bee-sat"
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return
|
||||
}
|
||||
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||
}
|
||||
|
||||
func updateFanObservation(fans []FanReading, now time.Time) {
|
||||
if len(fans) == 0 {
|
||||
return
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
changed := false
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
currentMax := fanObservation.MaxRPM[name]
|
||||
if fan.RPM <= currentMax {
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if cand, ok := fanPeakCandidates[name]; ok {
|
||||
if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
|
||||
newMax := math.Max(cand.RPM, fan.RPM)
|
||||
if newMax > currentMax {
|
||||
fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
|
||||
changed = true
|
||||
}
|
||||
delete(fanPeakCandidates, name)
|
||||
continue
|
||||
}
|
||||
if fan.RPM > cand.RPM {
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
|
||||
}
|
||||
continue
|
||||
}
|
||||
fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
|
||||
}
|
||||
if changed {
|
||||
saveFanObservationLocked()
|
||||
}
|
||||
}
|
||||
|
||||
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
fanObservationMu.Lock()
|
||||
defer fanObservationMu.Unlock()
|
||||
loadFanObservationLocked()
|
||||
var samples []float64
|
||||
for _, fan := range fans {
|
||||
name := strings.TrimSpace(fan.Name)
|
||||
if name == "" || fan.RPM <= 0 {
|
||||
continue
|
||||
}
|
||||
maxRPM := fanObservation.MaxRPM[name]
|
||||
if maxRPM <= 0 {
|
||||
continue
|
||||
}
|
||||
pct := fan.RPM / maxRPM * 100.0
|
||||
if pct > 100 {
|
||||
pct = 100
|
||||
}
|
||||
if pct < 0 {
|
||||
pct = 0
|
||||
}
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 2 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||
rpmVal := 0.0
|
||||
found := false
|
||||
for _, p := range parts[1:] {
|
||||
p = strings.TrimSpace(p)
|
||||
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||
continue
|
||||
}
|
||||
if strings.EqualFold(p, "RPM") {
|
||||
continue // unit-only column in old format; value is in previous field
|
||||
}
|
||||
val, err := parseFanRPMValue(p)
|
||||
if err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||
}
|
||||
return fans
|
||||
}
|
||||
|
||||
func parseFanRPMValue(raw string) (float64, error) {
|
||||
fields := strings.Fields(strings.TrimSpace(strings.ReplaceAll(raw, ",", "")))
|
||||
if len(fields) == 0 {
|
||||
return 0, strconv.ErrSyntax
|
||||
}
|
||||
return strconv.ParseFloat(fields[0], 64)
|
||||
}
|
||||
|
||||
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil, err
|
||||
}
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(out, &doc); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
chips := make([]string, 0, len(doc))
|
||||
for chip := range doc {
|
||||
chips = append(chips, chip)
|
||||
}
|
||||
sort.Strings(chips)
|
||||
var fans []FanReading
|
||||
seen := map[string]struct{}{}
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
names := make([]string, 0, len(features))
|
||||
for name := range features {
|
||||
names = append(names, name)
|
||||
}
|
||||
sort.Strings(names)
|
||||
for _, name := range names {
|
||||
feature, ok := features[name].(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
rpm, ok := firstFanInputValue(feature)
|
||||
if !ok || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
label := strings.TrimSpace(name)
|
||||
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
|
||||
label = chip + " / " + label
|
||||
}
|
||||
if _, ok := seen[label]; ok {
|
||||
continue
|
||||
}
|
||||
seen[label] = struct{}{}
|
||||
fans = append(fans, FanReading{Name: label, RPM: rpm})
|
||||
}
|
||||
}
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
fans, fanErr := sampleFanSpeeds()
|
||||
if fanErr != nil {
|
||||
return 0, false, false
|
||||
}
|
||||
return sampleFanDutyCyclePctFromFans(fans)
|
||||
}
|
||||
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||
return pct, ok, false
|
||||
}
|
||||
|
||||
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||
if len(fans) == 0 {
|
||||
return 0, false, false
|
||||
}
|
||||
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||
return pct, true, true
|
||||
}
|
||||
return 0, false, false
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
var samples []float64
|
||||
for _, features := range doc {
|
||||
for name, feature := range features {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
featureMap, ok := feature.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||
samples = append(samples, duty)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||
return 0, false
|
||||
}
|
||||
if strings.Contains(featureName, "pwm") {
|
||||
for _, key := range []string{"input", "value", "current"} {
|
||||
if value, ok := feature[key]; ok {
|
||||
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "pwm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||
continue
|
||||
}
|
||||
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func parseFanDutyValue(value any) (float64, bool) {
|
||||
switch v := value.(type) {
|
||||
case float64:
|
||||
return normalizePWMAsDutyPct(v)
|
||||
case string:
|
||||
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||
return normalizePWMAsDutyPct(f)
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
|
||||
if raw < 0 {
|
||||
return 0, false
|
||||
}
|
||||
if raw <= 100 {
|
||||
return raw, true
|
||||
}
|
||||
if raw <= 255 {
|
||||
return raw / 255.0 * 100.0, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "fan") || !strings.HasSuffix(lower, "_input") {
|
||||
continue
|
||||
}
|
||||
switch value := feature[key].(type) {
|
||||
case float64:
|
||||
return value, true
|
||||
case string:
|
||||
f, err := strconv.ParseFloat(value, 64)
|
||||
if err == nil {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||
func sampleCPUMaxTemp() float64 {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil {
|
||||
return sampleCPUTempViaSensors()
|
||||
}
|
||||
return parseIPMIMaxTemp(string(out))
|
||||
}
|
||||
|
||||
// parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type Temperature".
|
||||
func parseIPMIMaxTemp(raw string) float64 {
|
||||
var max float64
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
if !strings.Contains(strings.ToLower(unit), "degrees") {
|
||||
continue
|
||||
}
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if strings.EqualFold(valStr, "na") || valStr == "" {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(valStr, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable.
|
||||
func sampleCPUTempViaSensors() float64 {
|
||||
out, err := exec.Command("sensors", "-u").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var max float64
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
if !strings.HasSuffix(fields[0], "_input:") {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > 0 && val < 150 && val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
// Sample: " Instantaneous power reading: 500 Watts"
|
||||
func parseDCMIPowerReading(raw string) float64 {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
if !strings.Contains(strings.ToLower(line), "instantaneous") {
|
||||
continue
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
for i, p := range parts {
|
||||
if strings.EqualFold(p, "Watts") && i > 0 {
|
||||
val, err := strconv.ParseFloat(parts[i-1], 64)
|
||||
if err == nil {
|
||||
return val
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
return cache.Value, cache
|
||||
}
|
||||
return 0, cache
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" && row.Phase != "load2" {
|
||||
continue
|
||||
}
|
||||
for _, gpu := range row.GPUs {
|
||||
if gpu.Throttled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||
var max float64
|
||||
for _, row := range rows {
|
||||
if v := extract(row); v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||
// Compute baseline average fan RPM.
|
||||
var baseTotal, baseCount float64
|
||||
for _, row := range rows {
|
||||
if row.Phase != "baseline" {
|
||||
continue
|
||||
}
|
||||
for _, f := range row.Fans {
|
||||
baseTotal += f.RPM
|
||||
baseCount++
|
||||
}
|
||||
}
|
||||
if baseCount == 0 || baseTotal == 0 {
|
||||
return -1
|
||||
}
|
||||
baseAvg := baseTotal / baseCount
|
||||
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||
|
||||
// Find elapsed time when load1 started.
|
||||
var load1Start float64 = -1
|
||||
for _, row := range rows {
|
||||
if row.Phase == "load1" {
|
||||
load1Start = row.ElapsedSec
|
||||
break
|
||||
}
|
||||
}
|
||||
if load1Start < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Find first load1 row where average RPM crosses the threshold.
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" {
|
||||
continue
|
||||
}
|
||||
var total, count float64
|
||||
for _, f := range row.Fans {
|
||||
total += f.RPM
|
||||
count++
|
||||
}
|
||||
if count > 0 && total/count >= threshold {
|
||||
return row.ElapsedSec - load1Start
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
|
||||
// GPU columns are generated per index in gpuIndices order.
|
||||
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
|
||||
if len(rows) == 0 {
|
||||
return os.WriteFile(path, []byte("no data\n"), 0644)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
// Header: fixed system columns + per-GPU columns.
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
|
||||
for _, idx := range gpuIndices {
|
||||
fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
|
||||
idx, idx, idx, idx, idx)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
|
||||
for _, row := range rows {
|
||||
favg, fmin, fmax := fanRPMStats(row.Fans)
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
|
||||
row.TimestampUTC,
|
||||
row.ElapsedSec,
|
||||
row.Phase,
|
||||
favg, fmin, fmax,
|
||||
row.CPUMaxTempC,
|
||||
row.SysPowerW,
|
||||
)
|
||||
gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
|
||||
for _, g := range row.GPUs {
|
||||
gpuByIdx[g.Index] = g
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
g := gpuByIdx[idx]
|
||||
throttled := 0
|
||||
if g.Throttled {
|
||||
throttled = 1
|
||||
}
|
||||
fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
|
||||
g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||
var b strings.Builder
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||
for _, row := range rows {
|
||||
for _, f := range row.Fans {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||
}
|
||||
}
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||
if len(fans) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
min = fans[0].RPM
|
||||
max = fans[0].RPM
|
||||
var total float64
|
||||
for _, f := range fans {
|
||||
total += f.RPM
|
||||
if f.RPM < min {
|
||||
min = f.RPM
|
||||
}
|
||||
if f.RPM > max {
|
||||
max = f.RPM
|
||||
}
|
||||
}
|
||||
return total / float64(len(fans)), min, max
|
||||
}
|
||||
136
audit/internal/platform/sat_fan_stress_test.go
Normal file
136
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,136 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseFanSpeeds(t *testing.T) {
|
||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||
got := parseFanSpeeds(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
|
||||
}
|
||||
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
|
||||
t.Fatalf("fan0=%+v", got[0])
|
||||
}
|
||||
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
|
||||
t.Fatalf("fan1=%+v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFirstFanInputValue(t *testing.T) {
|
||||
feature := map[string]any{
|
||||
"fan1_input": 9200.0,
|
||||
}
|
||||
got, ok := firstFanInputValue(feature)
|
||||
if !ok || got != 9200 {
|
||||
t.Fatalf("got=%v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"chip0": {
|
||||
"fan1": {"input": 9000},
|
||||
"pwm1": {"input": 128},
|
||||
"pwm1_enable": {"input": 1}
|
||||
},
|
||||
"chip1": {
|
||||
"pwm2": {"input": 64}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||
if !ok {
|
||||
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||
}
|
||||
if got < 57 || got > 58 {
|
||||
t.Fatalf("got=%v want ~57.1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldPath := fanObservationStatePath
|
||||
oldState := fanObservation
|
||||
oldInit := fanObservationInit
|
||||
oldCandidates := fanPeakCandidates
|
||||
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
t.Cleanup(func() {
|
||||
fanObservationStatePath = oldPath
|
||||
fanObservation = oldState
|
||||
fanObservationInit = oldInit
|
||||
fanPeakCandidates = oldCandidates
|
||||
})
|
||||
|
||||
start := time.Unix(100, 0)
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||
t.Fatalf("single-sample spike should not establish observed max")
|
||||
}
|
||||
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||
|
||||
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("got=%v want ~43.3", got)
|
||||
}
|
||||
|
||||
fanObservation = fanObservationState{}
|
||||
fanObservationInit = false
|
||||
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||
if !ok {
|
||||
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||
}
|
||||
if got < 43 || got > 44 {
|
||||
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
Minimum during sampling period: 498 Watts
|
||||
`
|
||||
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
if updated.Value != 480 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
if updated.Value != 530 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
}
|
||||
@@ -1,20 +1,25 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1")
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda")
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
}
|
||||
@@ -25,21 +30,68 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
if len(jobs) != 6 {
|
||||
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||
if len(jobs) != 4 {
|
||||
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||
}
|
||||
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||
}
|
||||
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||
}
|
||||
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
got := jobs[5].cmd
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
@@ -50,6 +102,270 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 600,
|
||||
Loader: NvidiaStressLoaderJohn,
|
||||
ExcludeGPUIndices: []int{1},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 120,
|
||||
Loader: NvidiaStressLoaderNCCL,
|
||||
GPUIndices: []int{2, 0},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
got, err := resolveDCGMGPUIndices(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "0,1,2"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "1,3"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len=%d want 2", len(got))
|
||||
}
|
||||
if got[0].NeedsReset {
|
||||
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||
}
|
||||
if !got[1].NeedsReset {
|
||||
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
msg, err := checkNvidiaJobHealth([]int{1})
|
||||
if err == nil {
|
||||
t.Fatal("expected health check error")
|
||||
}
|
||||
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||
t.Fatalf("unexpected message: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||
0: {Index: 0, RunStatus: "OK"},
|
||||
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||
}
|
||||
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
if !strings.Contains(text, "run_status=FAILED") {
|
||||
t.Fatalf("missing run status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||
t.Fatalf("missing health status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||
t.Fatalf("missing failing job:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
switch file {
|
||||
case "dcgmproftester13":
|
||||
return "/usr/bin/dcgmproftester13", nil
|
||||
default:
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 4 {
|
||||
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||
}
|
||||
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||
}
|
||||
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -65,6 +381,37 @@ func TestEnvIntFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "65536M" {
|
||||
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||
|
||||
if got := memoryStressSizeArg(); got != "4096M" {
|
||||
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 0 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "80%" {
|
||||
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifySATResult(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -75,7 +422,9 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -87,3 +436,160 @@ func TestClassifySATResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
cancel()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||
}, nil)
|
||||
<-done
|
||||
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Fatalf("err=%v want context.Canceled", err)
|
||||
}
|
||||
if archive != "" {
|
||||
t.Fatalf("archive=%q want empty", archive)
|
||||
}
|
||||
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||
if globErr != nil {
|
||||
t.Fatalf("Glob error: %v", globErr)
|
||||
}
|
||||
if len(matches) != 0 {
|
||||
t.Fatalf("archives=%v want none", matches)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||
got := parseStorageDevices(raw)
|
||||
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
t.Setenv("PATH", t.TempDir())
|
||||
|
||||
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != toolPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
oldGlob := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
rocmSMIExecutableGlobs = []string{execPath}
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
rocmSMIExecutableGlobs = oldGlob
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
t.Setenv("PATH", "")
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != execPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
oldExecGlobs := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||
rocmSMIExecutableGlobs = nil
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
satLookPath = oldLookPath
|
||||
rocmSMIExecutableGlobs = oldExecGlobs
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||
t.Fatal("expected missing rocm-smi error")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,13 +10,30 @@ import (
|
||||
func (s *System) ListBeeServices() ([]string, error) {
|
||||
seen := map[string]bool{}
|
||||
var out []string
|
||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
||||
for _, pattern := range []string{
|
||||
"/etc/systemd/system/bee-*.service",
|
||||
"/lib/systemd/system/bee-*.service",
|
||||
"/etc/systemd/system/bee-*.timer",
|
||||
"/lib/systemd/system/bee-*.timer",
|
||||
} {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
base := filepath.Base(match)
|
||||
name := base
|
||||
if strings.HasSuffix(base, ".service") {
|
||||
name = strings.TrimSuffix(base, ".service")
|
||||
}
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
@@ -44,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
|
||||
151
audit/internal/platform/techdump.go
Normal file
151
audit/internal/platform/techdump.go
Normal file
@@ -0,0 +1,151 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var techDumpFixedCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "dmidecode", Args: []string{"-t", "0"}, File: "dmidecode-type0.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "1"}, File: "dmidecode-type1.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "2"}, File: "dmidecode-type2.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
var techDumpNvidiaCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
Blockdevices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Tran string `json:"tran"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
|
||||
type nvmeDumpRoot struct {
|
||||
Devices []struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
} `json:"Devices"`
|
||||
}
|
||||
|
||||
func (s *System) CaptureTechnicalDump(baseDir string) error {
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, cmd := range techDumpFixedCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
switch s.DetectGPUVendor() {
|
||||
case "nvidia":
|
||||
for _, cmd := range techDumpNvidiaCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
case "amd":
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||
}
|
||||
|
||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||
}
|
||||
for _, dev := range nvmeDumpDevices(filepath.Join(baseDir, "nvme-list.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "nvme-id-ctrl-"+sanitizeDumpName(dev)+".json"), "nvme", "id-ctrl", dev, "-o", "json")
|
||||
writeCommandDump(filepath.Join(baseDir, "nvme-smart-log-"+sanitizeDumpName(dev)+".json"), "nvme", "smart-log", dev, "-o", "json")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeCommandDump(path, name string, args ...string) {
|
||||
out, err := exec.Command(name, args...).CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func writeROCmSMIDump(path string, args ...string) {
|
||||
out, err := runROCmSMI(args...)
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func lsblkDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root lsblkDumpRoot
|
||||
if err := json.Unmarshal(raw, &root); err != nil {
|
||||
return nil
|
||||
}
|
||||
var devices []string
|
||||
for _, dev := range root.Blockdevices {
|
||||
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||
continue
|
||||
}
|
||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||
}
|
||||
}
|
||||
sort.Strings(devices)
|
||||
return devices
|
||||
}
|
||||
|
||||
func nvmeDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root nvmeDumpRoot
|
||||
if err := json.Unmarshal(raw, &root); err != nil {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
var devices []string
|
||||
for _, dev := range root.Devices {
|
||||
name := strings.TrimSpace(dev.DevicePath)
|
||||
if name == "" || seen[name] {
|
||||
continue
|
||||
}
|
||||
seen[name] = true
|
||||
devices = append(devices, name)
|
||||
}
|
||||
sort.Strings(devices)
|
||||
return devices
|
||||
}
|
||||
|
||||
func sanitizeDumpName(value string) string {
|
||||
value = strings.TrimSpace(value)
|
||||
value = strings.TrimPrefix(value, "/dev/")
|
||||
value = strings.ReplaceAll(value, "/", "_")
|
||||
if value == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return value
|
||||
}
|
||||
48
audit/internal/platform/techdump_test.go
Normal file
48
audit/internal/platform/techdump_test.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLSBLKDumpDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "lsblk.json")
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write lsblk fixture: %v", err)
|
||||
}
|
||||
|
||||
got := lsblkDumpDevices(path)
|
||||
want := []string{"nvme0n1", "sdb"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNVMEDumpDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "nvme-list.json")
|
||||
if err := os.WriteFile(path, []byte(`{"Devices":[{"DevicePath":"/dev/nvme1n1"},{"DevicePath":"/dev/nvme0n1"},{"DevicePath":"/dev/nvme1n1"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write nvme fixture: %v", err)
|
||||
}
|
||||
|
||||
got := nvmeDumpDevices(path)
|
||||
want := []string{"/dev/nvme0n1", "/dev/nvme1n1"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("nvmeDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeDumpName(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := sanitizeDumpName("/dev/nvme0n1"); got != "nvme0n1" {
|
||||
t.Fatalf("sanitizeDumpName=%q want nvme0n1", got)
|
||||
}
|
||||
}
|
||||
@@ -2,12 +2,42 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
type LiveBootSource struct {
|
||||
InRAM bool `json:"in_ram"`
|
||||
Kind string `json:"kind"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type LiveMediaRAMState struct {
|
||||
LiveBootSource
|
||||
State string `json:"state"`
|
||||
Status string `json:"status"`
|
||||
ToramActive bool `json:"toram_active,omitempty"`
|
||||
CopyPresent bool `json:"copy_present,omitempty"`
|
||||
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||
Message string `json:"message,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkInterfaceSnapshot struct {
|
||||
Name string
|
||||
Up bool
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkSnapshot struct {
|
||||
Interfaces []NetworkInterfaceSnapshot
|
||||
DefaultRoutes []string
|
||||
ResolvConf string
|
||||
}
|
||||
|
||||
type ServiceAction string
|
||||
|
||||
const (
|
||||
@@ -25,12 +55,12 @@ type StaticIPv4Config struct {
|
||||
}
|
||||
|
||||
type RemovableTarget struct {
|
||||
Device string
|
||||
FSType string
|
||||
Size string
|
||||
Label string
|
||||
Model string
|
||||
Mountpoint string
|
||||
Device string `json:"device"`
|
||||
FSType string `json:"fs_type"`
|
||||
Size string `json:"size"`
|
||||
Label string `json:"label"`
|
||||
Model string `json:"model"`
|
||||
Mountpoint string `json:"mountpoint"`
|
||||
}
|
||||
|
||||
type ToolStatus struct {
|
||||
@@ -39,6 +69,21 @@ type ToolStatus struct {
|
||||
OK bool
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaStressLoaderBuiltin = "builtin"
|
||||
NvidiaStressLoaderJohn = "john"
|
||||
NvidiaStressLoaderNCCL = "nccl"
|
||||
)
|
||||
|
||||
type NvidiaStressOptions struct {
|
||||
DurationSec int
|
||||
SizeMB int
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
return &System{}
|
||||
}
|
||||
|
||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
data, err := json.Marshal(RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "exfat",
|
||||
Size: "1.8T",
|
||||
Label: "USB",
|
||||
Model: "Flash",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
raw := string(data)
|
||||
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||
if !strings.Contains(raw, key) {
|
||||
t.Fatalf("json missing key %s: %s", key, raw)
|
||||
}
|
||||
}
|
||||
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||
t.Fatalf("json still contains Go field names: %s", raw)
|
||||
}
|
||||
}
|
||||
@@ -10,9 +10,52 @@ type HardwareIngestRequest struct {
|
||||
Protocol *string `json:"protocol,omitempty"`
|
||||
TargetHost *string `json:"target_host,omitempty"`
|
||||
CollectedAt string `json:"collected_at"`
|
||||
Runtime *RuntimeHealth `json:"runtime,omitempty"`
|
||||
Hardware HardwareSnapshot `json:"hardware"`
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
Interfaces []RuntimeInterface `json:"interfaces,omitempty"`
|
||||
}
|
||||
|
||||
type RuntimeIssue struct {
|
||||
Code string `json:"code"`
|
||||
Severity string `json:"severity,omitempty"`
|
||||
Description string `json:"description"`
|
||||
}
|
||||
|
||||
type RuntimeToolStatus struct {
|
||||
Name string `json:"name"`
|
||||
Path string `json:"path,omitempty"`
|
||||
OK bool `json:"ok"`
|
||||
}
|
||||
|
||||
type RuntimeServiceStatus struct {
|
||||
Name string `json:"name"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
type RuntimeInterface struct {
|
||||
Name string `json:"name"`
|
||||
State string `json:"state,omitempty"`
|
||||
IPv4 []string `json:"ipv4,omitempty"`
|
||||
Outcome string `json:"outcome,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareSnapshot struct {
|
||||
Board HardwareBoard `json:"board"`
|
||||
Firmware []HardwareFirmwareRecord `json:"firmware,omitempty"`
|
||||
@@ -22,6 +65,7 @@ type HardwareSnapshot struct {
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -143,12 +187,19 @@ type HardwarePCIeDevice struct {
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
|
||||
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
|
||||
BDF *string `json:"bdf,omitempty"`
|
||||
BDF *string `json:"-"`
|
||||
DeviceClass *string `json:"device_class,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
@@ -183,11 +234,12 @@ type HardwarePowerSupply struct {
|
||||
}
|
||||
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"`
|
||||
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
ManufacturedYearWeek *string `json:"manufactured_year_week,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareStatusHistory struct {
|
||||
@@ -235,3 +287,15 @@ type HardwareOtherSensor struct {
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
Source string `json:"source"`
|
||||
EventTime *string `json:"event_time,omitempty"`
|
||||
Severity *string `json:"severity,omitempty"`
|
||||
MessageID *string `json:"message_id,omitempty"`
|
||||
Message string `json:"message"`
|
||||
ComponentRef *string `json:"component_ref,omitempty"`
|
||||
Fingerprint *string `json:"fingerprint,omitempty"`
|
||||
IsActive *bool `json:"is_active,omitempty"`
|
||||
RawPayload map[string]any `json:"raw_payload,omitempty"`
|
||||
}
|
||||
|
||||
46
audit/internal/schema/hardware_test.go
Normal file
46
audit/internal/schema/hardware_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
week := "2024-W07"
|
||||
eventTime := "2026-03-15T14:03:11Z"
|
||||
message := "Correctable ECC error threshold exceeded"
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
CPUs: []HardwareCPU{
|
||||
{
|
||||
HardwareComponentStatus: HardwareComponentStatus{
|
||||
ManufacturedYearWeek: &week,
|
||||
},
|
||||
},
|
||||
},
|
||||
EventLogs: []HardwareEventLog{
|
||||
{
|
||||
Source: "bmc",
|
||||
EventTime: &eventTime,
|
||||
Message: message,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
if !strings.Contains(text, `"manufactured_year_week":"2024-W07"`) {
|
||||
t.Fatalf("missing manufactured_year_week: %s", text)
|
||||
}
|
||||
if !strings.Contains(text, `"event_logs":[{"source":"bmc","event_time":"2026-03-15T14:03:11Z","message":"Correctable ECC error threshold exceeded"}]`) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "esc":
|
||||
m.screen = screenNetwork
|
||||
m.formFields = nil
|
||||
m.formIndex = 0
|
||||
return m, nil
|
||||
case "up", "shift+tab":
|
||||
if m.formIndex > 0 {
|
||||
m.formIndex--
|
||||
}
|
||||
case "down", "tab":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
}
|
||||
case "enter":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
return m, nil
|
||||
}
|
||||
cfg := m.app.ParseStaticIPv4Config(m.selectedIface, []string{
|
||||
m.formFields[0].Value,
|
||||
m.formFields[1].Value,
|
||||
m.formFields[2].Value,
|
||||
m.formFields[3].Value,
|
||||
})
|
||||
m.busy = true
|
||||
m.busyTitle = "Static IPv4: " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.SetStaticIPv4Result(cfg)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case "backspace":
|
||||
field := &m.formFields[m.formIndex]
|
||||
if len(field.Value) > 0 {
|
||||
field.Value = field.Value[:len(field.Value)-1]
|
||||
}
|
||||
default:
|
||||
if msg.Type == tea.KeyRunes && len(msg.Runes) > 0 {
|
||||
m.formFields[m.formIndex].Value += string(msg.Runes)
|
||||
}
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "left", "up", "tab":
|
||||
if m.cursor > 0 {
|
||||
m.cursor--
|
||||
}
|
||||
case "right", "down":
|
||||
if m.cursor < 1 {
|
||||
m.cursor++
|
||||
}
|
||||
case "esc":
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
case "enter":
|
||||
if m.cursor == 1 {
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
}
|
||||
m.busy = true
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
m.busyTitle = "Export audit"
|
||||
target := *m.selectedTarget
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.ExportLatestAuditResult(target)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
||||
}
|
||||
case actionRunNvidiaSAT:
|
||||
m.busyTitle = "NVIDIA SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunNvidiaAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
case actionRunMemorySAT:
|
||||
m.busyTitle = "Memory SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunMemoryAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
case actionRunStorageSAT:
|
||||
m.busyTitle = "Storage SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunStorageAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
return screenExportTargets
|
||||
case actionRunNvidiaSAT:
|
||||
fallthrough
|
||||
case actionRunMemorySAT:
|
||||
fallthrough
|
||||
case actionRunStorageSAT:
|
||||
return screenAcceptance
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
package tui
|
||||
|
||||
import "bee/audit/internal/platform"
|
||||
|
||||
type resultMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
back screen
|
||||
}
|
||||
|
||||
type servicesMsg struct {
|
||||
services []string
|
||||
err error
|
||||
}
|
||||
|
||||
type interfacesMsg struct {
|
||||
ifaces []platform.InterfaceInfo
|
||||
err error
|
||||
}
|
||||
|
||||
type exportTargetsMsg struct {
|
||||
targets []platform.RemovableTarget
|
||||
err error
|
||||
}
|
||||
|
||||
type bannerMsg struct {
|
||||
text string
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleAcceptanceMenu() (tea.Model, tea.Cmd) {
|
||||
if m.cursor == 3 {
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
case 1:
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
case 2:
|
||||
m.pendingAction = actionRunStorageSAT
|
||||
}
|
||||
m.screen = screenConfirm
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.targets) == 0 {
|
||||
return m, resultCmd("Export audit", "No removable filesystems found", nil, screenMain)
|
||||
}
|
||||
target := m.targets[m.cursor]
|
||||
m.selectedTarget = &target
|
||||
m.pendingAction = actionExportAudit
|
||||
m.screen = screenConfirm
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.screen = screenNetwork
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 1:
|
||||
m.busy = true
|
||||
m.busyTitle = "Services"
|
||||
return m, func() tea.Msg {
|
||||
services, err := m.app.ListBeeServices()
|
||||
return servicesMsg{services: services, err: err}
|
||||
}
|
||||
case 2:
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 3:
|
||||
m.busy = true
|
||||
m.busyTitle = "Run audit"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunAuditNow(m.runtimeMode)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
||||
}
|
||||
case 4:
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
return m, func() tea.Msg {
|
||||
targets, err := m.app.ListRemovableTargets()
|
||||
return exportTargetsMsg{targets: targets, err: err}
|
||||
}
|
||||
case 5:
|
||||
m.busy = true
|
||||
m.busyTitle = "Required tools"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.ToolCheckResult([]string{"dmidecode", "smartctl", "nvme", "ipmitool", "lspci", "ethtool", "bee", "nvidia-smi", "bee-gpu-stress", "memtester", "dhclient", "lsblk", "mount"})
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 6:
|
||||
m.busy = true
|
||||
m.busyTitle = "Health summary"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.HealthSummaryResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 7:
|
||||
m.busy = true
|
||||
m.busyTitle = "Audit logs"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.AuditLogTailResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 8:
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,76 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.busy = true
|
||||
m.busyTitle = "Network status"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.NetworkStatus()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 1:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP all interfaces"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPAllResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 2:
|
||||
m.pendingAction = actionDHCPOne
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 3:
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 4:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.interfaces) == 0 {
|
||||
return m, resultCmd("interfaces", "No physical interfaces found", nil, screenNetwork)
|
||||
}
|
||||
m.selectedIface = m.interfaces[m.cursor].Name
|
||||
switch m.pendingAction {
|
||||
case actionDHCPOne:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP on " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPOneResult(m.selectedIface)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case actionStaticIPv4:
|
||||
defaults := m.app.DefaultStaticIPv4FormFields(m.selectedIface)
|
||||
m.formFields = []formField{
|
||||
{Label: "IPv4 address", Value: defaults[0]},
|
||||
{Label: "Prefix", Value: defaults[1]},
|
||||
{Label: "Gateway", Value: strings.TrimSpace(defaults[2])},
|
||||
{Label: "DNS (space-separated)", Value: defaults[3]},
|
||||
}
|
||||
m.formIndex = 0
|
||||
m.screen = screenStaticForm
|
||||
return m, nil
|
||||
default:
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.services) == 0 {
|
||||
return m, resultCmd("Services", "No bee-* services found.", nil, screenMain)
|
||||
}
|
||||
m.selectedService = m.services[m.cursor]
|
||||
m.screen = screenServiceAction
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
|
||||
action := m.serviceMenu[m.cursor]
|
||||
if action == "back" {
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
m.busy = true
|
||||
m.busyTitle = "service: " + m.selectedService
|
||||
return m, func() tea.Msg {
|
||||
switch action {
|
||||
case "Status":
|
||||
result, err := m.app.ServiceStatusResult(m.selectedService)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Restart":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Start":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Stop":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
default:
|
||||
return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,588 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func newTestModel() model {
|
||||
return newModel(app.New(platform.New()), runtimeenv.ModeLocal)
|
||||
}
|
||||
|
||||
func sendKey(t *testing.T, m model, key tea.KeyType) model {
|
||||
t.Helper()
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: key})
|
||||
return next.(model)
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuCursorNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 2 {
|
||||
t.Fatalf("cursor=%d want 2 after second down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyUp)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after up", m.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuEnterActions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
wantBusy bool
|
||||
wantCmd bool
|
||||
}{
|
||||
{name: "network", cursor: 0, wantScreen: screenNetwork},
|
||||
{name: "services", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "acceptance", cursor: 2, wantScreen: screenAcceptance},
|
||||
{name: "run audit", cursor: 3, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "export", cursor: 4, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.busy != test.wantBusy {
|
||||
t.Fatalf("busy=%v want %v", got.busy, test.wantBusy)
|
||||
}
|
||||
if (cmd != nil) != test.wantCmd {
|
||||
t.Fatalf("cmd present=%v want %v", cmd != nil, test.wantCmd)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateConfirmCancelViaKeys(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
|
||||
got := next.(model)
|
||||
if got.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after right", got.cursor)
|
||||
}
|
||||
|
||||
next, _ = got.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got = next.(model)
|
||||
if got.screen != screenAcceptance {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenAcceptance)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0 after cancel", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuSimpleTransitions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "network", cursor: 0, wantScreen: screenNetwork},
|
||||
{name: "acceptance", cursor: 2, wantScreen: screenAcceptance},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatalf("expected nil cmd for %s", test.name)
|
||||
}
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuAsyncActionsSetBusy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
}{
|
||||
{name: "services", cursor: 1},
|
||||
{name: "run audit", cursor: 3},
|
||||
{name: "export", cursor: 4},
|
||||
{name: "check tools", cursor: 5},
|
||||
{name: "health summary", cursor: 6},
|
||||
{name: "log tail", cursor: 7},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if !got.busy {
|
||||
t.Fatalf("busy=false for %s", test.name)
|
||||
}
|
||||
if cmd == nil {
|
||||
t.Fatalf("expected async cmd for %s", test.name)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainViewIncludesBanner(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.banner = "System: Test Server | S/N ABC123\nIP: 10.0.0.10"
|
||||
|
||||
view := m.View()
|
||||
if !strings.Contains(view, "System: Test Server | S/N ABC123") {
|
||||
t.Fatalf("view missing system banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "IP: 10.0.0.10") {
|
||||
t.Fatalf("view missing ip banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "Select action") {
|
||||
t.Fatalf("view missing menu subtitle:\n%s", view)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapeNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
screen screen
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "network to main", screen: screenNetwork, wantScreen: screenMain},
|
||||
{name: "services to main", screen: screenServices, wantScreen: screenMain},
|
||||
{name: "acceptance to main", screen: screenAcceptance, wantScreen: screenMain},
|
||||
{name: "service action to services", screen: screenServiceAction, wantScreen: screenServices},
|
||||
{name: "export targets to main", screen: screenExportTargets, wantScreen: screenMain},
|
||||
{name: "interface pick to network", screen: screenInterfacePick, wantScreen: screenNetwork},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = test.screen
|
||||
m.cursor = 3
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.prevScreen = screenNetwork
|
||||
m.title = "title"
|
||||
m.body = "body"
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenNetwork {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNetwork)
|
||||
}
|
||||
if got.title != "" || got.body != "" {
|
||||
t.Fatalf("expected output state cleared, got title=%q body=%q", got.title, got.body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcceptanceConfirmFlow(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
|
||||
next, cmd := m.handleAcceptanceMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionRunNvidiaSAT {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunNvidiaSAT)
|
||||
}
|
||||
|
||||
next, _ = got.updateConfirm(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got = next.(model)
|
||||
if got.screen != screenAcceptance {
|
||||
t.Fatalf("screen after esc=%q want %q", got.screen, screenAcceptance)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcceptanceMenuMapsNewTargets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
cursor int
|
||||
want actionKind
|
||||
}{
|
||||
{cursor: 0, want: actionRunNvidiaSAT},
|
||||
{cursor: 1, want: actionRunMemorySAT},
|
||||
{cursor: 2, want: actionRunStorageSAT},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
m := newTestModel()
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, _ := m.handleAcceptanceMenu()
|
||||
got := next.(model)
|
||||
if got.pendingAction != test.want {
|
||||
t.Fatalf("cursor=%d pendingAction=%q want %q", test.cursor, got.pendingAction, test.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportTargetSelectionOpensConfirm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{{Device: "/dev/sdb1", FSType: "vfat", Size: "16G"}}
|
||||
|
||||
next, cmd := m.handleExportTargetsMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionExportAudit {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionExportAudit)
|
||||
}
|
||||
if got.selectedTarget == nil || got.selectedTarget.Device != "/dev/sdb1" {
|
||||
t.Fatalf("selectedTarget=%+v want /dev/sdb1", got.selectedTarget)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInterfacePickStaticIPv4OpensForm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.interfaces = []platform.InterfaceInfo{{Name: "eth0"}}
|
||||
|
||||
next, cmd := m.handleInterfacePickMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenStaticForm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenStaticForm)
|
||||
}
|
||||
if got.selectedIface != "eth0" {
|
||||
t.Fatalf("selectedIface=%q want eth0", got.selectedIface)
|
||||
}
|
||||
if len(got.formFields) != 4 {
|
||||
t.Fatalf("len(formFields)=%d want 4", len(got.formFields))
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgUsesExplicitBackScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "done", body: "ok", back: screenNetwork})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenOutput {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
||||
}
|
||||
if got.prevScreen != screenNetwork {
|
||||
t.Fatalf("prevScreen=%q want %q", got.prevScreen, screenNetwork)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfirmCancelTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m.pendingAction = actionExportAudit
|
||||
if got := m.confirmCancelTarget(); got != screenExportTargets {
|
||||
t.Fatalf("export cancel target=%q want %q", got, screenExportTargets)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
if got := m.confirmCancelTarget(); got != screenAcceptance {
|
||||
t.Fatalf("sat cancel target=%q want %q", got, screenAcceptance)
|
||||
}
|
||||
|
||||
m.pendingAction = actionNone
|
||||
if got := m.confirmCancelTarget(); got != screenMain {
|
||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewMainMenuRendersSelectedItem(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"bee",
|
||||
"Select action",
|
||||
" Network",
|
||||
"> Services",
|
||||
"Acceptance tests",
|
||||
"[↑/↓] move [enter] select [esc] back [ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBusyStateIsMinimal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
|
||||
view := m.View()
|
||||
want := "bee\n\nWorking...\n\n[ctrl+c] quit\n"
|
||||
if view != want {
|
||||
t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Working...",
|
||||
"[ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.title = "Run audit"
|
||||
m.body = "audit output: /var/log/bee-audit.json\n"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Run audit",
|
||||
"audit output: /var/log/bee-audit.json",
|
||||
"[enter/esc] back [ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{
|
||||
{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "vfat",
|
||||
Size: "29G",
|
||||
Label: "BEEUSB",
|
||||
Mountpoint: "/media/bee",
|
||||
},
|
||||
}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Select removable filesystem",
|
||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenStaticForm
|
||||
m.selectedIface = "enp1s0"
|
||||
m.formFields = []formField{
|
||||
{Label: "Address", Value: "192.0.2.10/24"},
|
||||
{Label: "Gateway", Value: "192.0.2.1"},
|
||||
{Label: "DNS", Value: "1.1.1.1"},
|
||||
}
|
||||
m.formIndex = 1
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Static IPv4: enp1s0",
|
||||
" Address: 192.0.2.10/24",
|
||||
"> Gateway: 192.0.2.1",
|
||||
" DNS: 1.1.1.1",
|
||||
"[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionExportAudit
|
||||
m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Copy latest audit JSON to /dev/sdb1?",
|
||||
"> Confirm",
|
||||
" Cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
m.pendingAction = actionExportAudit
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "Export audit", body: "done", back: screenMain})
|
||||
got := next.(model)
|
||||
|
||||
if got.busy {
|
||||
t.Fatal("busy=true want false")
|
||||
}
|
||||
if got.busyTitle != "" {
|
||||
t.Fatalf("busyTitle=%q want empty", got.busyTitle)
|
||||
}
|
||||
if got.pendingAction != actionNone {
|
||||
t.Fatalf("pendingAction=%q want empty", got.pendingAction)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "Export audit", err: assertErr("boom"), back: screenMain})
|
||||
got := next.(model)
|
||||
|
||||
if got.body != "ERROR: boom" {
|
||||
t.Fatalf("body=%q want %q", got.body, "ERROR: boom")
|
||||
}
|
||||
}
|
||||
|
||||
type assertErr string
|
||||
|
||||
func (e assertErr) Error() string { return string(e) }
|
||||
@@ -1,119 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
type screen string
|
||||
|
||||
const (
|
||||
screenMain screen = "main"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
screenServices screen = "services"
|
||||
screenServiceAction screen = "service_action"
|
||||
screenAcceptance screen = "acceptance"
|
||||
screenExportTargets screen = "export_targets"
|
||||
screenOutput screen = "output"
|
||||
screenStaticForm screen = "static_form"
|
||||
screenConfirm screen = "confirm"
|
||||
)
|
||||
|
||||
type actionKind string
|
||||
|
||||
const (
|
||||
actionNone actionKind = ""
|
||||
actionDHCPOne actionKind = "dhcp_one"
|
||||
actionStaticIPv4 actionKind = "static_ipv4"
|
||||
actionExportAudit actionKind = "export_audit"
|
||||
actionRunNvidiaSAT actionKind = "run_nvidia_sat"
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
app *app.App
|
||||
runtimeMode runtimeenv.Mode
|
||||
|
||||
screen screen
|
||||
prevScreen screen
|
||||
cursor int
|
||||
busy bool
|
||||
busyTitle string
|
||||
title string
|
||||
body string
|
||||
banner string
|
||||
mainMenu []string
|
||||
networkMenu []string
|
||||
serviceMenu []string
|
||||
|
||||
services []string
|
||||
interfaces []platform.InterfaceInfo
|
||||
targets []platform.RemovableTarget
|
||||
selectedService string
|
||||
selectedIface string
|
||||
selectedTarget *platform.RemovableTarget
|
||||
pendingAction actionKind
|
||||
|
||||
formFields []formField
|
||||
formIndex int
|
||||
}
|
||||
|
||||
type formField struct {
|
||||
Label string
|
||||
Value string
|
||||
}
|
||||
|
||||
func Run(application *app.App, runtimeMode runtimeenv.Mode) error {
|
||||
options := []tea.ProgramOption{}
|
||||
if runtimeMode != runtimeenv.ModeLiveCD {
|
||||
options = append(options, tea.WithAltScreen())
|
||||
}
|
||||
program := tea.NewProgram(newModel(application, runtimeMode), options...)
|
||||
_, err := program.Run()
|
||||
return err
|
||||
}
|
||||
|
||||
func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
return model{
|
||||
app: application,
|
||||
runtimeMode: runtimeMode,
|
||||
screen: screenMain,
|
||||
mainMenu: []string{
|
||||
"Network",
|
||||
"Services",
|
||||
"Acceptance tests",
|
||||
"Run audit",
|
||||
"Export audit",
|
||||
"Check tools",
|
||||
"Show health summary",
|
||||
"Show audit logs",
|
||||
"Exit",
|
||||
},
|
||||
networkMenu: []string{
|
||||
"Show status",
|
||||
"DHCP on all interfaces",
|
||||
"DHCP on one interface",
|
||||
"Set static IPv4",
|
||||
"Back",
|
||||
},
|
||||
serviceMenu: []string{
|
||||
"Status",
|
||||
"Restart",
|
||||
"Start",
|
||||
"Stop",
|
||||
"Back",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (m model) Init() tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return bannerMsg{text: strings.TrimSpace(m.app.MainBanner())}
|
||||
}
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
switch msg := msg.(type) {
|
||||
case tea.KeyMsg:
|
||||
if m.busy {
|
||||
switch msg.String() {
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
default:
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
return m.updateKey(msg)
|
||||
case resultMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
body := strings.TrimSpace(msg.body)
|
||||
if body == "" {
|
||||
m.body = fmt.Sprintf("ERROR: %v", msg.err)
|
||||
} else {
|
||||
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
|
||||
}
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
m.pendingAction = actionNone
|
||||
if msg.back != "" {
|
||||
m.prevScreen = msg.back
|
||||
} else {
|
||||
m.prevScreen = m.screen
|
||||
}
|
||||
m.screen = screenOutput
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case servicesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "Services"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.services = msg.services
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case interfacesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "interfaces"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.interfaces = msg.ifaces
|
||||
m.screen = screenInterfacePick
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case exportTargetsMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "export"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.targets = msg.targets
|
||||
m.screen = screenExportTargets
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case bannerMsg:
|
||||
m.banner = strings.TrimSpace(msg.text)
|
||||
return m, nil
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return m.updateMenu(msg, len(m.mainMenu), m.handleMainMenu)
|
||||
case screenNetwork:
|
||||
return m.updateMenu(msg, len(m.networkMenu), m.handleNetworkMenu)
|
||||
case screenServices:
|
||||
return m.updateMenu(msg, len(m.services), m.handleServicesMenu)
|
||||
case screenServiceAction:
|
||||
return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
|
||||
case screenAcceptance:
|
||||
return m.updateMenu(msg, 4, m.handleAcceptanceMenu)
|
||||
case screenExportTargets:
|
||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
||||
case screenInterfacePick:
|
||||
return m.updateMenu(msg, len(m.interfaces), m.handleInterfacePickMenu)
|
||||
case screenOutput:
|
||||
switch msg.String() {
|
||||
case "esc", "enter", "q":
|
||||
m.screen = m.prevScreen
|
||||
m.body = ""
|
||||
m.title = ""
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
case screenStaticForm:
|
||||
return m.updateStaticForm(msg)
|
||||
case screenConfirm:
|
||||
return m.updateConfirm(msg)
|
||||
}
|
||||
|
||||
if msg.String() == "ctrl+c" {
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateMenu(msg tea.KeyMsg, size int, onEnter func() (tea.Model, tea.Cmd)) (tea.Model, tea.Cmd) {
|
||||
if size == 0 {
|
||||
size = 1
|
||||
}
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.cursor > 0 {
|
||||
m.cursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.cursor < size-1 {
|
||||
m.cursor++
|
||||
}
|
||||
case "enter":
|
||||
return onEnter()
|
||||
case "esc":
|
||||
switch m.screen {
|
||||
case screenNetwork, screenServices, screenAcceptance:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
case screenServiceAction:
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
case screenExportTargets:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
case screenInterfacePick:
|
||||
m.screen = screenNetwork
|
||||
m.cursor = 0
|
||||
}
|
||||
case "q", "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,169 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) View() string {
|
||||
if m.busy {
|
||||
title := "bee"
|
||||
if m.busyTitle != "" {
|
||||
title = m.busyTitle
|
||||
}
|
||||
return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
|
||||
}
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return renderMainMenu("bee", m.banner, "Select action", m.mainMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
||||
case screenServices:
|
||||
return renderMenu("Services", "Select service", m.services, m.cursor)
|
||||
case screenServiceAction:
|
||||
items := make([]string, len(m.serviceMenu))
|
||||
copy(items, m.serviceMenu)
|
||||
return renderMenu("Service: "+m.selectedService, "Select action", items, m.cursor)
|
||||
case screenAcceptance:
|
||||
return renderMenu("Acceptance tests", "Select action", []string{"Run NVIDIA command pack", "Run memory test", "Run storage diagnostic pack", "Back"}, m.cursor)
|
||||
case screenExportTargets:
|
||||
return renderMenu("Export audit", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
|
||||
case screenInterfacePick:
|
||||
return renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
||||
case screenStaticForm:
|
||||
return renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
|
||||
case screenConfirm:
|
||||
title, body := m.confirmBody()
|
||||
return renderConfirm(title, body, m.cursor)
|
||||
case screenOutput:
|
||||
return fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
return "bee\n"
|
||||
}
|
||||
}
|
||||
|
||||
func (m model) confirmBody() (string, string) {
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
if m.selectedTarget == nil {
|
||||
return "Export audit", "No target selected"
|
||||
}
|
||||
return "Export audit", fmt.Sprintf("Copy latest audit JSON to %s?", m.selectedTarget.Device)
|
||||
case actionRunNvidiaSAT:
|
||||
return "NVIDIA SAT", "Run NVIDIA acceptance command pack?"
|
||||
case actionRunMemorySAT:
|
||||
return "Memory SAT", "Run runtime memory test with memtester?"
|
||||
case actionRunStorageSAT:
|
||||
return "Storage SAT", "Run storage diagnostic pack and start short self-tests where supported?"
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
}
|
||||
|
||||
func renderTargetItems(targets []platform.RemovableTarget) []string {
|
||||
items := make([]string, 0, len(targets))
|
||||
for _, target := range targets {
|
||||
desc := fmt.Sprintf("%s [%s %s]", target.Device, target.FSType, target.Size)
|
||||
if target.Label != "" {
|
||||
desc += " label=" + target.Label
|
||||
}
|
||||
if target.Mountpoint != "" {
|
||||
desc += " mounted=" + target.Mountpoint
|
||||
}
|
||||
items = append(items, desc)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderInterfaceItems(interfaces []platform.InterfaceInfo) []string {
|
||||
items := make([]string, 0, len(interfaces))
|
||||
for _, iface := range interfaces {
|
||||
label := iface.Name
|
||||
if len(iface.IPv4) > 0 {
|
||||
label += " [" + strings.Join(iface.IPv4, ", ") + "]"
|
||||
}
|
||||
items = append(items, label)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderMenu(title, subtitle string, items []string, cursor int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n%s\n\n", title, subtitle)
|
||||
if len(items) == 0 {
|
||||
body.WriteString("(no items)\n")
|
||||
} else {
|
||||
for i, item := range items {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s\n", prefix, item)
|
||||
}
|
||||
}
|
||||
body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderMainMenu(title, banner, subtitle string, items []string, cursor int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
if banner != "" {
|
||||
body.WriteString(strings.TrimSpace(banner))
|
||||
body.WriteString("\n\n")
|
||||
}
|
||||
body.WriteString(subtitle)
|
||||
body.WriteString("\n\n")
|
||||
if len(items) == 0 {
|
||||
body.WriteString("(no items)\n")
|
||||
} else {
|
||||
for i, item := range items {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s\n", prefix, item)
|
||||
}
|
||||
}
|
||||
body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderForm(title string, fields []formField, idx int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
for i, field := range fields {
|
||||
prefix := " "
|
||||
if i == idx {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s: %s\n", prefix, field.Label, field.Value)
|
||||
}
|
||||
body.WriteString("\n[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderConfirm(title, body string, cursor int) string {
|
||||
options := []string{"Confirm", "Cancel"}
|
||||
var out strings.Builder
|
||||
fmt.Fprintf(&out, "%s\n\n%s\n\n", title, body)
|
||||
for i, option := range options {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&out, "%s%s\n", prefix, option)
|
||||
}
|
||||
out.WriteString("\n[←/→/↑/↓] move [enter] select [esc] cancel\n")
|
||||
return out.String()
|
||||
}
|
||||
|
||||
func resultCmd(title, body string, err error, back screen) tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return resultMsg{title: title, body: body, err: err, back: back}
|
||||
}
|
||||
}
|
||||
1549
audit/internal/webui/api.go
Normal file
1549
audit/internal/webui/api.go
Normal file
File diff suppressed because it is too large
Load Diff
299
audit/internal/webui/api_test.go
Normal file
299
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,299 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-perf" {
|
||||
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
}
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var resp taskRunResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if len(resp.TaskIDs) != 2 {
|
||||
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-bench-power" {
|
||||
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
if task.params.RampTotal != 3 {
|
||||
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
got := []int{
|
||||
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
defaultTaskPriority("audit", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-perf", taskParams{}),
|
||||
defaultTaskPriority("nvidia-bench-power", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
taskPriorityAudit,
|
||||
taskPriorityValidate,
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
992
audit/internal/webui/charts_svg.go
Normal file
992
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,992 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
type chartTimelineSegment struct {
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Active bool
|
||||
}
|
||||
|
||||
type chartScale struct {
|
||||
Min float64
|
||||
Max float64
|
||||
Ticks []float64
|
||||
}
|
||||
|
||||
type chartLayout struct {
|
||||
Width int
|
||||
Height int
|
||||
PlotLeft int
|
||||
PlotRight int
|
||||
PlotTop int
|
||||
PlotBottom int
|
||||
}
|
||||
|
||||
type metricChartSeries struct {
|
||||
Name string
|
||||
AxisTitle string
|
||||
Color string
|
||||
Values []float64
|
||||
}
|
||||
|
||||
var metricChartPalette = []string{
|
||||
"#5794f2",
|
||||
"#73bf69",
|
||||
"#f2cc0c",
|
||||
"#ff9830",
|
||||
"#f2495c",
|
||||
"#b877d9",
|
||||
"#56d2f7",
|
||||
"#8ab8ff",
|
||||
"#9adf8f",
|
||||
"#ffbe5c",
|
||||
}
|
||||
|
||||
var gpuLabelCache struct {
|
||||
mu sync.Mutex
|
||||
loadedAt time.Time
|
||||
byIndex map[int]string
|
||||
}
|
||||
|
||||
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
values := make([]float64, pointCount)
|
||||
if i < len(datasets) {
|
||||
copy(values, coalesceDataset(datasets[i], pointCount))
|
||||
}
|
||||
legendItems = append(legendItems, metricChartSeries{
|
||||
Name: name,
|
||||
Color: color,
|
||||
Values: values,
|
||||
})
|
||||
}
|
||||
|
||||
scale := singleAxisChartScale(datasets, yMin, yMax)
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for _, item := range legendItems {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
|
||||
}
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if temp == nil && power == nil && coreClock == nil {
|
||||
return nil, false, nil
|
||||
}
|
||||
labels := sampleTimeLabels(samples)
|
||||
times := sampleTimes(samples)
|
||||
svg, err := drawGPUOverviewChartSVG(
|
||||
gpuDisplayLabel(idx)+" Overview",
|
||||
labels,
|
||||
times,
|
||||
[]metricChartSeries{
|
||||
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||
},
|
||||
timeline,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
return svg, true, nil
|
||||
}
|
||||
|
||||
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
if len(series) != 3 {
|
||||
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||
}
|
||||
const (
|
||||
width = 1400
|
||||
height = 840
|
||||
plotLeft = 180
|
||||
plotRight = 1220
|
||||
plotTop = 96
|
||||
plotBottom = 660
|
||||
)
|
||||
const (
|
||||
leftOuterAxis = 72
|
||||
leftInnerAxis = 132
|
||||
rightInnerAxis = 1268
|
||||
)
|
||||
layout := chartLayout{
|
||||
Width: width,
|
||||
Height: height,
|
||||
PlotLeft: plotLeft,
|
||||
PlotRight: plotRight,
|
||||
PlotTop: plotTop,
|
||||
PlotBottom: plotBottom,
|
||||
}
|
||||
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range series {
|
||||
if len(series[i].Values) == 0 {
|
||||
series[i].Values = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points before building SVG.
|
||||
{
|
||||
datasets := make([][]float64, len(series))
|
||||
for i := range series {
|
||||
datasets[i] = series[i].Values
|
||||
}
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
for i := range series {
|
||||
series[i].Values = datasets[i]
|
||||
}
|
||||
}
|
||||
|
||||
scales := make([]chartScale, len(series))
|
||||
for i := range series {
|
||||
min, max := chartSeriesBounds(series[i].Values)
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
scales[i] = chartScale{
|
||||
Min: ticks[0],
|
||||
Max: ticks[len(ticks)-1],
|
||||
Ticks: ticks,
|
||||
}
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, width, height)
|
||||
writeChartFrame(&b, title, "", width, height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scales[0])
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
|
||||
for i, axisLineX := range axisX {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
|
||||
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
|
||||
for _, tick := range scales[i].Ticks {
|
||||
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
|
||||
label := sanitizeChartText(chartYAxisNumber(tick))
|
||||
if i < 2 {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX+6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX-8, y, series[i].Color, label)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX-6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX+8, y, series[i].Color, label)
|
||||
}
|
||||
}
|
||||
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for i := range series {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
|
||||
}
|
||||
writeLegend(&b, layout, series)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||
if len(samples) == 0 {
|
||||
return nil
|
||||
}
|
||||
times := sampleTimes(samples)
|
||||
start, end := chartTimeBounds(times)
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||
}
|
||||
|
||||
func snapshotTaskHistory() []Task {
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
out := make([]Task, len(globalQueue.tasks))
|
||||
for i, t := range globalQueue.tasks {
|
||||
out[i] = *t
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
type interval struct {
|
||||
start time.Time
|
||||
end time.Time
|
||||
}
|
||||
active := make([]interval, 0, len(tasks))
|
||||
for _, task := range tasks {
|
||||
if task.StartedAt == nil {
|
||||
continue
|
||||
}
|
||||
intervalStart := task.StartedAt.UTC()
|
||||
intervalEnd := now.UTC()
|
||||
if task.DoneAt != nil {
|
||||
intervalEnd = task.DoneAt.UTC()
|
||||
}
|
||||
if !intervalEnd.After(intervalStart) {
|
||||
continue
|
||||
}
|
||||
if intervalEnd.Before(start) || intervalStart.After(end) {
|
||||
continue
|
||||
}
|
||||
if intervalStart.Before(start) {
|
||||
intervalStart = start
|
||||
}
|
||||
if intervalEnd.After(end) {
|
||||
intervalEnd = end
|
||||
}
|
||||
active = append(active, interval{start: intervalStart, end: intervalEnd})
|
||||
}
|
||||
sort.Slice(active, func(i, j int) bool {
|
||||
if active[i].start.Equal(active[j].start) {
|
||||
return active[i].end.Before(active[j].end)
|
||||
}
|
||||
return active[i].start.Before(active[j].start)
|
||||
})
|
||||
merged := make([]interval, 0, len(active))
|
||||
for _, span := range active {
|
||||
if len(merged) == 0 {
|
||||
merged = append(merged, span)
|
||||
continue
|
||||
}
|
||||
last := &merged[len(merged)-1]
|
||||
if !span.start.After(last.end) {
|
||||
if span.end.After(last.end) {
|
||||
last.end = span.end
|
||||
}
|
||||
continue
|
||||
}
|
||||
merged = append(merged, span)
|
||||
}
|
||||
|
||||
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
|
||||
cursor := start
|
||||
for _, span := range merged {
|
||||
if span.start.After(cursor) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
|
||||
}
|
||||
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
|
||||
cursor = span.end
|
||||
}
|
||||
if cursor.Before(end) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
|
||||
}
|
||||
if len(segments) == 0 {
|
||||
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
|
||||
}
|
||||
return segments
|
||||
}
|
||||
|
||||
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||
times := make([]time.Time, 0, len(samples))
|
||||
for _, sample := range samples {
|
||||
times = append(times, sample.Timestamp)
|
||||
}
|
||||
return times
|
||||
}
|
||||
|
||||
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||
min, max := 0.0, 1.0
|
||||
if yMin != nil && yMax != nil {
|
||||
min, max = *yMin, *yMax
|
||||
} else {
|
||||
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||
if yMin != nil {
|
||||
min = *yMin
|
||||
}
|
||||
if yMax != nil {
|
||||
max = *yMax
|
||||
}
|
||||
}
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||
}
|
||||
|
||||
func flattenDatasets(datasets [][]float64) []float64 {
|
||||
total := 0
|
||||
for _, ds := range datasets {
|
||||
total += len(ds)
|
||||
}
|
||||
out := make([]float64, 0, total)
|
||||
for _, ds := range datasets {
|
||||
out = append(out, ds...)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||
legendRows := 0
|
||||
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||
cols := 4
|
||||
if seriesCount < cols {
|
||||
cols = seriesCount
|
||||
}
|
||||
legendRows = (seriesCount + cols - 1) / cols
|
||||
}
|
||||
legendHeight := 0
|
||||
if legendRows > 0 {
|
||||
legendHeight = legendRows*24 + 24
|
||||
}
|
||||
return chartLayout{
|
||||
Width: 1400,
|
||||
Height: canvasHeight,
|
||||
PlotLeft: 96,
|
||||
PlotRight: 1352,
|
||||
PlotTop: 72,
|
||||
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||
}
|
||||
}
|
||||
|
||||
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
|
||||
if len(times) == 0 {
|
||||
return time.Time{}, time.Time{}
|
||||
}
|
||||
start := times[0].UTC()
|
||||
end := start
|
||||
for _, ts := range times[1:] {
|
||||
t := ts.UTC()
|
||||
if t.Before(start) {
|
||||
start = t
|
||||
}
|
||||
if t.After(end) {
|
||||
end = t
|
||||
}
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
if count <= 0 {
|
||||
return nil
|
||||
}
|
||||
if len(times) == count {
|
||||
return times
|
||||
}
|
||||
if len(times) == 1 {
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = times[0].Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = base.Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||
// power charts where the filled area of each PSU shows its individual
|
||||
// contribution and the total height equals the combined draw.
|
||||
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
// Build cumulative sums per time point.
|
||||
cumulative := make([][]float64, len(datasets)+1)
|
||||
for i := range cumulative {
|
||||
cumulative[i] = make([]float64, pointCount)
|
||||
}
|
||||
for i, ds := range datasets {
|
||||
for j, v := range ds {
|
||||
cumulative[i+1][j] = cumulative[i][j] + v
|
||||
}
|
||||
}
|
||||
|
||||
// Scale is based on the total (top cumulative row).
|
||||
total := cumulative[len(cumulative)-1]
|
||||
yMin := floatPtr(0)
|
||||
if yMax == nil {
|
||||
yMax = autoMax120(total)
|
||||
}
|
||||
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||
|
||||
legendItems := make([]metricChartSeries, len(datasets))
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||
}
|
||||
|
||||
// Stats label from totals.
|
||||
statsLabel := chartStatsLabel([][]float64{total})
|
||||
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
|
||||
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
// Draw border polylines on top.
|
||||
for i := len(datasets) - 1; i >= 0; i-- {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||
}
|
||||
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
// writeStackedArea draws a filled polygon between two cumulative value arrays
|
||||
// (baseline and top), using the given color at 55% opacity.
|
||||
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
|
||||
n := len(top)
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
if len(baseline) < n {
|
||||
baseline = make([]float64, n)
|
||||
}
|
||||
|
||||
// Forward path along top values, then backward along baseline values.
|
||||
var points strings.Builder
|
||||
for i := 0; i < n; i++ {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
if i > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
for i := n - 1; i >= 0; i-- {
|
||||
x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteByte(' ')
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
|
||||
}
|
||||
|
||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||
}
|
||||
|
||||
func writeSVGClose(b *strings.Builder) {
|
||||
b.WriteString("</svg>\n")
|
||||
}
|
||||
|
||||
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(title))
|
||||
if strings.TrimSpace(subtitle) != "" {
|
||||
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(subtitle))
|
||||
}
|
||||
}
|
||||
|
||||
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
|
||||
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotRight, y)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||
if pointCount <= 0 {
|
||||
return
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
ts := chartPointTime(times, idx)
|
||||
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||
}
|
||||
}
|
||||
|
||||
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
label := ""
|
||||
if idx < len(labels) {
|
||||
label = labels[idx]
|
||||
}
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||
}
|
||||
|
||||
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||
if len(values) == 0 {
|
||||
return
|
||||
}
|
||||
var points strings.Builder
|
||||
for idx, value := range values {
|
||||
if idx > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||
points.String(), color)
|
||||
if len(values) == 1 {
|
||||
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||
return
|
||||
}
|
||||
peakIdx := 0
|
||||
peakValue := values[0]
|
||||
for idx, value := range values[1:] {
|
||||
if value >= peakValue {
|
||||
peakIdx = idx + 1
|
||||
peakValue = value
|
||||
}
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||
}
|
||||
|
||||
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||
return
|
||||
}
|
||||
cols := 4
|
||||
if len(series) < cols {
|
||||
cols = len(series)
|
||||
}
|
||||
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||
baseY := layout.PlotBottom + 74
|
||||
for i, item := range series {
|
||||
row := i / cols
|
||||
col := i % cols
|
||||
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||
y := float64(baseY + row*24)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||
x, y, x+28, y, item.Color)
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||
x+38, y+4, sanitizeChartText(item.Name))
|
||||
}
|
||||
}
|
||||
|
||||
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||
for _, segment := range segments {
|
||||
if segment.Active || !segment.End.After(segment.Start) {
|
||||
continue
|
||||
}
|
||||
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
seen := map[int]bool{}
|
||||
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||
for i, segment := range segments {
|
||||
if i > 0 {
|
||||
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
if i < len(segments)-1 {
|
||||
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
// downsampleTimeSeries reduces the time series to at most maxPts points using
|
||||
// min-max bucketing. Each bucket contributes the index of its min and max value
|
||||
// (using the first full-length dataset as the reference series). All parallel
|
||||
// datasets are sampled at those same indices so all series stay aligned.
|
||||
// If len(times) <= maxPts the inputs are returned unchanged.
|
||||
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
|
||||
n := len(times)
|
||||
if n <= maxPts || maxPts <= 0 {
|
||||
return times, datasets
|
||||
}
|
||||
buckets := maxPts / 2
|
||||
if buckets < 1 {
|
||||
buckets = 1
|
||||
}
|
||||
// Use the first dataset that has the same length as times as the reference
|
||||
// for deciding which two indices to keep per bucket.
|
||||
var ref []float64
|
||||
for _, ds := range datasets {
|
||||
if len(ds) == n {
|
||||
ref = ds
|
||||
break
|
||||
}
|
||||
}
|
||||
selected := make([]int, 0, maxPts)
|
||||
bucketSize := float64(n) / float64(buckets)
|
||||
for b := 0; b < buckets; b++ {
|
||||
lo := int(math.Round(float64(b) * bucketSize))
|
||||
hi := int(math.Round(float64(b+1) * bucketSize))
|
||||
if hi > n {
|
||||
hi = n
|
||||
}
|
||||
if lo >= hi {
|
||||
continue
|
||||
}
|
||||
if ref == nil {
|
||||
selected = append(selected, lo)
|
||||
if hi-1 != lo {
|
||||
selected = append(selected, hi-1)
|
||||
}
|
||||
continue
|
||||
}
|
||||
minIdx, maxIdx := lo, lo
|
||||
for i := lo + 1; i < hi; i++ {
|
||||
if ref[i] < ref[minIdx] {
|
||||
minIdx = i
|
||||
}
|
||||
if ref[i] > ref[maxIdx] {
|
||||
maxIdx = i
|
||||
}
|
||||
}
|
||||
if minIdx <= maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
if maxIdx != minIdx {
|
||||
selected = append(selected, maxIdx)
|
||||
}
|
||||
} else {
|
||||
selected = append(selected, maxIdx)
|
||||
if minIdx != maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
}
|
||||
}
|
||||
}
|
||||
outTimes := make([]time.Time, len(selected))
|
||||
for i, idx := range selected {
|
||||
outTimes[i] = times[idx]
|
||||
}
|
||||
outDatasets := make([][]float64, len(datasets))
|
||||
for d, ds := range datasets {
|
||||
if len(ds) != n {
|
||||
outDatasets[d] = ds
|
||||
continue
|
||||
}
|
||||
out := make([]float64, len(selected))
|
||||
for i, idx := range selected {
|
||||
out[i] = ds[idx]
|
||||
}
|
||||
outDatasets[d] = out
|
||||
}
|
||||
return outTimes, outDatasets
|
||||
}
|
||||
|
||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||
if !end.After(start) {
|
||||
return float64(left+right) / 2
|
||||
}
|
||||
if ts.Before(start) {
|
||||
ts = start
|
||||
}
|
||||
if ts.After(end) {
|
||||
ts = end
|
||||
}
|
||||
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
|
||||
return float64(left) + ratio*float64(right-left)
|
||||
}
|
||||
|
||||
func chartPointTime(times []time.Time, idx int) time.Time {
|
||||
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
|
||||
return times[idx].UTC()
|
||||
}
|
||||
if len(times) > 0 && !times[0].IsZero() {
|
||||
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
|
||||
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||
if scale.Max <= scale.Min {
|
||||
return float64(plotTop+plotBottom) / 2
|
||||
}
|
||||
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||
}
|
||||
|
||||
func chartSeriesBounds(values []float64) (float64, float64) {
|
||||
if len(values) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
min, max := values[0], values[0]
|
||||
for _, value := range values[1:] {
|
||||
if value < min {
|
||||
min = value
|
||||
}
|
||||
if value > max {
|
||||
max = value
|
||||
}
|
||||
}
|
||||
if min == max {
|
||||
if max == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
pad := math.Abs(max) * 0.1
|
||||
if pad == 0 {
|
||||
pad = 1
|
||||
}
|
||||
min -= pad
|
||||
max += pad
|
||||
}
|
||||
if min > 0 {
|
||||
pad := (max - min) * 0.2
|
||||
if pad == 0 {
|
||||
pad = max * 0.1
|
||||
}
|
||||
min -= pad
|
||||
if min < 0 {
|
||||
min = 0
|
||||
}
|
||||
max += pad
|
||||
}
|
||||
return min, max
|
||||
}
|
||||
|
||||
func chartNiceTicks(min, max float64, target int) []float64 {
|
||||
if min == max {
|
||||
max = min + 1
|
||||
}
|
||||
span := max - min
|
||||
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
|
||||
for _, factor := range []float64{1, 2, 5, 10} {
|
||||
if span/(factor*step) <= float64(target)*1.5 {
|
||||
step = factor * step
|
||||
break
|
||||
}
|
||||
}
|
||||
low := math.Floor(min/step) * step
|
||||
high := math.Ceil(max/step) * step
|
||||
var ticks []float64
|
||||
for value := low; value <= high+step*0.001; value += step {
|
||||
ticks = append(ticks, math.Round(value*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func valueClamp(value float64, scale chartScale) float64 {
|
||||
if value < scale.Min {
|
||||
return scale.Min
|
||||
}
|
||||
if value > scale.Max {
|
||||
return scale.Max
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
func chartStatsLabel(datasets [][]float64) string {
|
||||
mn, avg, mx := globalStats(datasets)
|
||||
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf("min %s avg %s max %s",
|
||||
chartLegendNumber(mn),
|
||||
chartLegendNumber(avg),
|
||||
chartLegendNumber(mx),
|
||||
)
|
||||
}
|
||||
|
||||
func gpuDisplayLabel(idx int) string {
|
||||
if name := gpuModelNameByIndex(idx); name != "" {
|
||||
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||
}
|
||||
return fmt.Sprintf("GPU %d", idx)
|
||||
}
|
||||
|
||||
func gpuModelNameByIndex(idx int) string {
|
||||
now := time.Now()
|
||||
gpuLabelCache.mu.Lock()
|
||||
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||
gpuLabelCache.loadedAt = now
|
||||
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||
}
|
||||
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||
gpuLabelCache.mu.Unlock()
|
||||
return name
|
||||
}
|
||||
|
||||
func loadGPUModelNames() map[int]string {
|
||||
out := map[int]string{}
|
||||
gpus, err := platform.New().ListNvidiaGPUs()
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
for _, gpu := range gpus {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name != "" {
|
||||
out[gpu.Index] = name
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
195
audit/internal/webui/jobs.go
Normal file
195
audit/internal/webui/jobs.go
Normal file
@@ -0,0 +1,195 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
|
||||
type jobState struct {
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
// readTaskLogFile reads a task log, refusing files over 50 MB.
|
||||
func readTaskLogFile(path string) ([]byte, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if int64(len(data)) > 50<<20 {
|
||||
return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
func (j *jobState) abort() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.done || j.cancel == nil {
|
||||
return false
|
||||
}
|
||||
j.cancel()
|
||||
return true
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
case ch <- line:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||
func (j *jobState) writeLogLineLocked(line string) {
|
||||
if j.logFile == nil {
|
||||
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
j.logFile = f
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
func (j *jobState) closeLog() {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.logBuf != nil {
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
if j.logFile != nil {
|
||||
_ = j.logFile.Close()
|
||||
j.logFile = nil
|
||||
j.logBuf = nil
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.done = true
|
||||
j.err = errMsg
|
||||
for _, ch := range j.subs {
|
||||
close(ch)
|
||||
}
|
||||
j.subs = nil
|
||||
}
|
||||
|
||||
// subscribe returns a channel that receives all future lines.
|
||||
// Existing lines are returned first, then the channel streams new ones.
|
||||
func (j *jobState) subscribe() ([]string, <-chan string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
existing := make([]string, len(j.lines))
|
||||
copy(existing, j.lines)
|
||||
if j.done {
|
||||
return existing, nil
|
||||
}
|
||||
ch := make(chan string, 256)
|
||||
j.subs = append(j.subs, ch)
|
||||
return existing, ch
|
||||
}
|
||||
|
||||
// jobManager manages async jobs identified by string IDs.
|
||||
type jobManager struct {
|
||||
mu sync.Mutex
|
||||
jobs map[string]*jobState
|
||||
}
|
||||
|
||||
var globalJobs = &jobManager{jobs: make(map[string]*jobState)}
|
||||
|
||||
func (m *jobManager) create(id string) *jobState {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j := &jobState{}
|
||||
m.jobs[id] = j
|
||||
// Schedule cleanup after 30 minutes
|
||||
goRecoverOnce("job cleanup", func() {
|
||||
time.Sleep(30 * time.Minute)
|
||||
m.mu.Lock()
|
||||
delete(m.jobs, id)
|
||||
m.mu.Unlock()
|
||||
})
|
||||
return j
|
||||
}
|
||||
|
||||
// isDone returns true if the job has finished (either successfully or with error).
|
||||
func (j *jobState) isDone() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
return j.done
|
||||
}
|
||||
|
||||
func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j, ok := m.jobs[id]
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if len(serialPrefix) > 0 {
|
||||
j.serialPrefix = serialPrefix[0]
|
||||
}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := readTaskLogFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||
lines = lines[:len(lines)-1]
|
||||
}
|
||||
j.lines = append(j.lines, lines...)
|
||||
return j
|
||||
}
|
||||
|
||||
func appendJobLog(path, line string) {
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
_, _ = f.WriteString(line + "\n")
|
||||
}
|
||||
242
audit/internal/webui/kmsg_watcher.go
Normal file
242
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,242 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||
// while any SAT task is running, and flushed when all tasks complete.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeCount int // number of in-flight SAT tasks
|
||||
window *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
targets []string // SAT targets running concurrently
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
type kmsgEventKey struct {
|
||||
id string // BDF or device name
|
||||
category string
|
||||
}
|
||||
|
||||
type kmsgEvent struct {
|
||||
timestamp time.Time
|
||||
raw string
|
||||
ids []string // BDF addresses or device names extracted
|
||||
category string
|
||||
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
for {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
time.Sleep(30 * time.Second)
|
||||
continue
|
||||
}
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
_ = f.Close()
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||
// if this is the first task starting.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.activeCount == 0 {
|
||||
w.window = &kmsgWindow{
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
w.activeCount++
|
||||
if w.window != nil {
|
||||
w.window.targets = append(w.window.targets, target)
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||
// it flushes the accumulated events to the status DB.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
w.activeCount--
|
||||
var window *kmsgWindow
|
||||
if w.activeCount <= 0 {
|
||||
w.activeCount = 0
|
||||
window = w.window
|
||||
w.window = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||
func normalizeBDF(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
return "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
443
audit/internal/webui/metricsdb.go
Normal file
443
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,443 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||
|
||||
// MetricsDB persists live metric samples to SQLite.
|
||||
type MetricsDB struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
func (m *MetricsDB) Close() error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
return m.db.Close()
|
||||
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db.SetMaxOpenConns(1)
|
||||
if err := initMetricsSchema(db); err != nil {
|
||||
_ = db.Close()
|
||||
return nil, err
|
||||
}
|
||||
return &MetricsDB{db: db}, nil
|
||||
}
|
||||
|
||||
func initMetricsSchema(db *sql.DB) error {
|
||||
_, err := db.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
gpu_index INTEGER NOT NULL,
|
||||
temp_c REAL,
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
clock_mhz REAL,
|
||||
mem_clock_mhz REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
rpm REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
grp TEXT NOT NULL,
|
||||
celsius REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||
}
|
||||
|
||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||
rows, err := db.Query("PRAGMA table_info(" + table + ")")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var cid int
|
||||
var name, ctype string
|
||||
var notNull, pk int
|
||||
var dflt sql.NullString
|
||||
if err := rows.Scan(&cid, &name, &ctype, ¬Null, &dflt, &pk); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.EqualFold(name, column) {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
|
||||
return err
|
||||
}
|
||||
|
||||
// Write inserts one sample into all relevant tables.
|
||||
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
ts := s.Timestamp.Unix()
|
||||
tx, err := m.db.Begin()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, f := range s.Fans {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||
ts, f.Name, f.RPM,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||
ts, t.Name, t.Group, t.Celsius,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||
// the overall shape of every chart.
|
||||
//
|
||||
// Called hourly by the metrics collector background goroutine.
|
||||
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
start := deleteOlderThan.Unix()
|
||||
end := downsampleBefore.Unix()
|
||||
if end <= start {
|
||||
return nil
|
||||
}
|
||||
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, err := m.db.Exec(`
|
||||
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||
AND ts NOT IN (
|
||||
SELECT MIN(ts) FROM `+table+`
|
||||
WHERE ts >= ? AND ts < ?
|
||||
GROUP BY ts / 60
|
||||
)`, start, end, start, end)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
}
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
|
||||
// Load GPU rows in range
|
||||
type gpuKey struct {
|
||||
ts int64
|
||||
idx int
|
||||
}
|
||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||
gRows, err := m.db.Query(
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer gRows.Close()
|
||||
for gRows.Next() {
|
||||
var ts int64
|
||||
var g platform.GPUMetricRow
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load fan rows in range
|
||||
type fanKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
fanData := map[fanKey]float64{}
|
||||
fRows, err := m.db.Query(
|
||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer fRows.Close()
|
||||
for fRows.Next() {
|
||||
var ts int64
|
||||
var name string
|
||||
var rpm float64
|
||||
if err := fRows.Scan(&ts, &name, &rpm); err == nil {
|
||||
fanData[fanKey{ts, name}] = rpm
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load temp rows in range
|
||||
type tempKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
tempData := map[tempKey]platform.TempReading{}
|
||||
tRows, err := m.db.Query(
|
||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer tRows.Close()
|
||||
for tRows.Next() {
|
||||
var ts int64
|
||||
var t platform.TempReading
|
||||
if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
|
||||
tempData[tempKey{ts, t.Name}] = t
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
if !seenGPU[k.idx] {
|
||||
seenGPU[k.idx] = true
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
if !seenFan[k.name] {
|
||||
seenFan[k.name] = true
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
if !seenTemp[k.name] {
|
||||
seenTemp[k.name] = true
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
s.GPUs = append(s.GPUs, g)
|
||||
}
|
||||
}
|
||||
for _, name := range fanNames {
|
||||
if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
|
||||
s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
|
||||
}
|
||||
}
|
||||
for _, name := range tempNames {
|
||||
if t, ok := tempData[tempKey{r.ts, name}]; ok {
|
||||
s.Temps = append(s.Temps, t)
|
||||
}
|
||||
}
|
||||
samples[i] = s
|
||||
}
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||
g.clock_mhz, g.mem_clock_mhz
|
||||
FROM sys_metrics s
|
||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||
ORDER BY s.ts, g.gpu_index
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
cw := csv.NewWriter(w)
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||
for rows.Next() {
|
||||
var ts int64
|
||||
var cpu, mem, pwr float64
|
||||
var gpuIdx sql.NullInt64
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||
continue
|
||||
}
|
||||
row := []string{
|
||||
strconv.FormatInt(ts, 10),
|
||||
strconv.FormatFloat(cpu, 'f', 2, 64),
|
||||
strconv.FormatFloat(mem, 'f', 2, 64),
|
||||
strconv.FormatFloat(pwr, 'f', 1, 64),
|
||||
}
|
||||
if gpuIdx.Valid {
|
||||
row = append(row,
|
||||
strconv.FormatInt(gpuIdx.Int64, 10),
|
||||
strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||
)
|
||||
} else {
|
||||
row = append(row, "", "", "", "", "", "", "")
|
||||
}
|
||||
_ = cw.Write(row)
|
||||
}
|
||||
cw.Flush()
|
||||
return cw.Error()
|
||||
}
|
||||
|
||||
func nullFloat(v float64) sql.NullFloat64 {
|
||||
return sql.NullFloat64{Float64: v, Valid: true}
|
||||
}
|
||||
174
audit/internal/webui/metricsdb_test.go
Normal file
174
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||
raw, err := sql.Open("sqlite", path)
|
||||
if err != nil {
|
||||
t.Fatalf("sql.Open: %v", err)
|
||||
}
|
||||
_, err = raw.Exec(`
|
||||
CREATE TABLE gpu_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
gpu_index INTEGER NOT NULL,
|
||||
temp_c REAL,
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE sys_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE fan_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
rpm REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
CREATE TABLE temp_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
grp TEXT NOT NULL,
|
||||
celsius REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatalf("create legacy schema: %v", err)
|
||||
}
|
||||
_ = raw.Close()
|
||||
|
||||
db, err := openMetricsDB(path)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
now := time.Unix(1_700_000_100, 0).UTC()
|
||||
err = db.Write(platform.LiveMetricSample{
|
||||
Timestamp: now,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
|
||||
samples, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||
t.Fatalf("samples=%+v", samples)
|
||||
}
|
||||
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||
}
|
||||
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 5; i++ {
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||
CPULoadPct: float64(i),
|
||||
}); err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("LoadBetween: %v", err)
|
||||
}
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||
}
|
||||
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||
}
|
||||
}
|
||||
3994
audit/internal/webui/pages.go
Normal file
3994
audit/internal/webui/pages.go
Normal file
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var taskSerialWriteLine = writeTaskSerialLine
|
||||
|
||||
func writeTaskSerialLine(line string) {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
return
|
||||
}
|
||||
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
|
||||
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
|
||||
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
_, _ = f.WriteString(payload)
|
||||
_ = f.Close()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func taskSerialPrefix(t *Task) string {
|
||||
if t == nil {
|
||||
return "[task] "
|
||||
}
|
||||
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||
}
|
||||
|
||||
func taskSerialEvent(t *Task, event string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||
}
|
||||
1451
audit/internal/webui/server.go
Normal file
1451
audit/internal/webui/server.go
Normal file
File diff suppressed because it is too large
Load Diff
1157
audit/internal/webui/server_test.go
Normal file
1157
audit/internal/webui/server_test.go
Normal file
File diff suppressed because it is too large
Load Diff
71
audit/internal/webui/stability.go
Normal file
71
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"runtime/debug"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
recoverLoopMaxDelay = 60 * time.Second
|
||||
recoverLoopResetAfter = 30 * time.Second
|
||||
)
|
||||
|
||||
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||
// restartDelay is the initial delay; successive panics double it up to
|
||||
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||
// successfully for recoverLoopResetAfter without panicking.
|
||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||
go func() {
|
||||
delay := restartDelay
|
||||
consecutive := 0
|
||||
for {
|
||||
start := time.Now()
|
||||
panicked := runRecoverable(name, fn)
|
||||
if !panicked {
|
||||
return
|
||||
}
|
||||
consecutive++
|
||||
if time.Since(start) >= recoverLoopResetAfter {
|
||||
delay = restartDelay
|
||||
consecutive = 1
|
||||
}
|
||||
slog.Warn("goroutine restarting after panic",
|
||||
"component", name,
|
||||
"consecutive_panics", consecutive,
|
||||
"next_delay", delay,
|
||||
)
|
||||
if delay > 0 {
|
||||
time.Sleep(delay)
|
||||
}
|
||||
if delay < recoverLoopMaxDelay {
|
||||
delay *= 2
|
||||
if delay > recoverLoopMaxDelay {
|
||||
delay = recoverLoopMaxDelay
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func goRecoverOnce(name string, fn func()) {
|
||||
go func() {
|
||||
_ = runRecoverable(name, fn)
|
||||
}()
|
||||
}
|
||||
|
||||
func runRecoverable(name string, fn func()) (panicked bool) {
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
panicked = true
|
||||
slog.Error("recovered panic",
|
||||
"component", name,
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
}
|
||||
}()
|
||||
fn()
|
||||
return false
|
||||
}
|
||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
task, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
snapshot := *task
|
||||
body := renderTaskDetailPage(h.opts, snapshot)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(body))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
type taskChartIndexEntry struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
entries := make([]taskChartIndexEntry, 0)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
_ = json.NewEncoder(w).Encode(entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||
path, ok := taskChartPathFromFile(file)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||
title := task.Name
|
||||
if strings.TrimSpace(title) == "" {
|
||||
title = task.ID
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||
}
|
||||
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||
body.WriteString(`</div>`)
|
||||
|
||||
if report := loadTaskReportFragment(task); report != "" {
|
||||
body.WriteString(report)
|
||||
} else {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||
}
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
body.WriteString(`<script>
|
||||
function cancelTaskDetail(id) {
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||
var term = document.getElementById('task-live-log');
|
||||
if (term) {
|
||||
term.textContent += '\nCancel requested.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
});
|
||||
}
|
||||
function renderTaskLiveCharts(taskId, charts) {
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
const seen = {};
|
||||
charts.forEach(function(chart) {
|
||||
seen[chart.file] = true;
|
||||
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||
if (img) {
|
||||
const card = img.closest('.card');
|
||||
if (card) {
|
||||
const title = card.querySelector('.card-head');
|
||||
if (title) title.textContent = chart.title;
|
||||
}
|
||||
return;
|
||||
}
|
||||
const card = document.createElement('div');
|
||||
card.className = 'card';
|
||||
card.style.margin = '0';
|
||||
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||
card.querySelector('.card-head').textContent = chart.title;
|
||||
const body = card.querySelector('.card-body');
|
||||
img = document.createElement('img');
|
||||
img.setAttribute('data-task-chart', '1');
|
||||
img.setAttribute('data-chart-file', chart.file);
|
||||
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||
img.style.width = '100%';
|
||||
img.style.display = 'block';
|
||||
img.style.borderRadius = '6px';
|
||||
img.alt = chart.title;
|
||||
body.appendChild(img);
|
||||
host.appendChild(card);
|
||||
});
|
||||
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||
const file = img.getAttribute('data-chart-file') || '';
|
||||
if (seen[file]) return;
|
||||
const card = img.closest('.card');
|
||||
if (card) card.remove();
|
||||
});
|
||||
}
|
||||
function loadTaskLiveCharts(taskId) {
|
||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||
renderTaskLiveCharts(taskId, charts);
|
||||
}).catch(function(){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||
});
|
||||
}
|
||||
function refreshTaskLiveCharts() {
|
||||
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||
const base = img.dataset.baseSrc;
|
||||
if (!base) return;
|
||||
img.src = base + '?t=' + Date.now();
|
||||
});
|
||||
}
|
||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||
var _taskChartTimer = null;
|
||||
var _taskChartsFrozen = false;
|
||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||
_taskDetailES.addEventListener('done', function(e){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
_taskChartsFrozen = true;
|
||||
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||
refreshTaskLiveCharts();
|
||||
});
|
||||
_taskDetailES.onerror = function(){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
if (_taskDetailES) {
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
}
|
||||
};
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
_taskChartTimer = setInterval(function(){
|
||||
if (_taskChartsFrozen) return;
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
refreshTaskLiveCharts();
|
||||
}, 2000);
|
||||
</script>`)
|
||||
}
|
||||
|
||||
return layoutHead(opts.Title+" — "+title) +
|
||||
layoutNav("tasks", opts.BuildLabel) +
|
||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||
body.String() +
|
||||
`</div></div></body></html>`
|
||||
}
|
||||
|
||||
func loadTaskReportFragment(task Task) string {
|
||||
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||
return ""
|
||||
}
|
||||
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||
if strings.TrimSpace(absPath) == "" {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||
}
|
||||
|
||||
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||
id := r.PathValue("id")
|
||||
taskPtr, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||
}
|
||||
task := *taskPtr
|
||||
start, end := taskTimeWindow(&task)
|
||||
samples, err := loadTaskMetricSamples(start, end)
|
||||
if err != nil {
|
||||
return task, nil, start, end, true
|
||||
}
|
||||
return task, samples, start, end, true
|
||||
}
|
||||
|
||||
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||
start, end := taskTimeWindow(&task)
|
||||
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
}
|
||||
|
||||
func taskChartPathFromFile(file string) (string, bool) {
|
||||
file = strings.TrimSpace(file)
|
||||
for _, spec := range taskDashboardChartSpecs {
|
||||
if spec.File == file {
|
||||
return spec.Path, true
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||
return "gpu/" + id + "-overview", true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
371
audit/internal/webui/task_report.go
Normal file
371
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,371 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
var taskReportMetricsDBPath = metricsDBPath
|
||||
|
||||
type taskReport struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
DurationSec int `json:"duration_sec,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LogFile string `json:"log_file,omitempty"`
|
||||
Charts []taskReportChart `json:"charts,omitempty"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
}
|
||||
|
||||
type taskReportChart struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
|
||||
type taskChartSpec struct {
|
||||
Path string
|
||||
File string
|
||||
}
|
||||
|
||||
var taskDashboardChartSpecs = []taskChartSpec{
|
||||
{Path: "server-load", File: "server-load.svg"},
|
||||
{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
|
||||
{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
|
||||
{Path: "server-power", File: "server-power.svg"},
|
||||
{Path: "server-fans", File: "server-fans.svg"},
|
||||
{Path: "gpu-all-load", File: "gpu-all-load.svg"},
|
||||
{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
|
||||
{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
|
||||
{Path: "gpu-all-power", File: "gpu-all-power.svg"},
|
||||
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
||||
}
|
||||
|
||||
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||
specs = append(specs, taskDashboardChartSpecs...)
|
||||
for _, idx := range taskGPUIndices(samples) {
|
||||
specs = append(specs, taskChartSpec{
|
||||
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||
})
|
||||
}
|
||||
return specs
|
||||
}
|
||||
|
||||
func writeTaskReportArtifacts(t *Task) error {
|
||||
if t == nil {
|
||||
return nil
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start, end := taskTimeWindow(t)
|
||||
samples, _ := loadTaskMetricSamples(start, end)
|
||||
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||
|
||||
logText := ""
|
||||
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||
logText = string(data)
|
||||
}
|
||||
|
||||
report := taskReport{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||
Error: t.ErrMsg,
|
||||
LogFile: filepath.Base(t.LogPath),
|
||||
Charts: charts,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
}
|
||||
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||
}
|
||||
|
||||
func reportDoneTime(t *Task) time.Time {
|
||||
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
return *t.DoneAt
|
||||
}
|
||||
return time.Now()
|
||||
}
|
||||
|
||||
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||
if t == nil {
|
||||
now := time.Now().UTC()
|
||||
return now, now
|
||||
}
|
||||
start := t.CreatedAt.UTC()
|
||||
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||
start = t.StartedAt.UTC()
|
||||
}
|
||||
end := time.Now().UTC()
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = t.DoneAt.UTC()
|
||||
}
|
||||
if end.Before(start) {
|
||||
end = start
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer db.Close()
|
||||
return db.LoadBetween(start, end)
|
||||
}
|
||||
|
||||
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||
if len(samples) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
var charts []taskReportChart
|
||||
inline := make(map[string]string)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||
if !ok || len(svg) == 0 {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(dir, spec.File)
|
||||
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||
inline[spec.File] = string(svg)
|
||||
}
|
||||
return charts, inline
|
||||
}
|
||||
|
||||
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil || !hasData {
|
||||
return "", nil, false
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
var buf []byte
|
||||
var err error
|
||||
if stacked {
|
||||
buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
} else {
|
||||
buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
|
||||
}
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
return title, buf, true
|
||||
}
|
||||
|
||||
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]bool{}
|
||||
var out []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if seen[g.GPUIndex] {
|
||||
continue
|
||||
}
|
||||
seen[g.GPUIndex] = true
|
||||
out = append(out, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func writeJSONFile(path string, v any) error {
|
||||
data, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="grid2">`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||
if strings.TrimSpace(report.Error) != "" {
|
||||
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||
}
|
||||
b.WriteString(`</div></div>`)
|
||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||
b.WriteString(`</div></div></div>`)
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||
b.WriteString(charts[chart.File])
|
||||
b.WriteString(`</div></div>`)
|
||||
}
|
||||
} else {
|
||||
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia-bench-perf":
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||
if len(runs) == 0 {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Perf Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func renderTaskPowerResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
raw, err := os.ReadFile(resultPath)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var result platform.NvidiaPowerBenchResult
|
||||
if err := json.Unmarshal(raw, &result); err != nil {
|
||||
return ""
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||
}
|
||||
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||
}
|
||||
b.WriteString(`</table></div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
func taskArchivePathFromLog(logText string) string {
|
||||
lines := strings.Split(logText, "\n")
|
||||
for i := len(lines) - 1; i >= 0; i-- {
|
||||
line := strings.TrimSpace(lines[i])
|
||||
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||
if strings.HasPrefix(path, "Archive written to ") {
|
||||
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||
}
|
||||
if strings.HasSuffix(path, ".tar.gz") {
|
||||
return path
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func renderTaskStatusBadge(status string) string {
|
||||
className := map[string]string{
|
||||
TaskRunning: "badge-ok",
|
||||
TaskPending: "badge-unknown",
|
||||
TaskDone: "badge-ok",
|
||||
TaskFailed: "badge-err",
|
||||
TaskCancelled: "badge-unknown",
|
||||
}[status]
|
||||
if className == "" {
|
||||
className = "badge-unknown"
|
||||
}
|
||||
label := strings.TrimSpace(status)
|
||||
if label == "" {
|
||||
label = "unknown"
|
||||
}
|
||||
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||
}
|
||||
|
||||
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||
if ts != nil && !ts.IsZero() {
|
||||
return ts.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
if !fallback.IsZero() {
|
||||
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
return "n/a"
|
||||
}
|
||||
|
||||
func formatTaskDuration(sec int) string {
|
||||
if sec <= 0 {
|
||||
return "n/a"
|
||||
}
|
||||
if sec < 60 {
|
||||
return fmt.Sprintf("%ds", sec)
|
||||
}
|
||||
if sec < 3600 {
|
||||
return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
1350
audit/internal/webui/tasks.go
Normal file
1350
audit/internal/webui/tasks.go
Normal file
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user