Compare commits
182 Commits
audit/v1.0
...
v5.5
| Author | SHA1 | Date | |
|---|---|---|---|
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 | |||
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 | |||
| 0a98ed8ae9 | |||
| 911745e4da | |||
| acfd2010d7 | |||
| e904c13790 | |||
| 24c5c72cee | |||
| 6ff0bcad56 | |||
| 4fef26000c | |||
| a393dcb731 | |||
| 9e55728053 | |||
| 4b8023c1cb | |||
| 4c8417d20a | |||
| 0755374dd2 | |||
| c70ae274fa | |||
| 23ad7ff534 | |||
| de130966f7 | |||
| c6fbfc8306 | |||
| 35ad1c74d9 | |||
| 4a02e74b17 | |||
| cd2853ad99 | |||
| 6caf771d6e | |||
| 14fa87b7d7 | |||
| 600ece911b | |||
| 2d424c63cb | |||
| 50f28d1ee6 | |||
| 3579747ae3 | |||
| 09dc7d2613 | |||
| ec0b7f7ff9 | |||
| e7a7ff54b9 | |||
| b4371e291e | |||
| c22b53a406 | |||
| ff0acc3698 | |||
| d50760e7c6 | |||
| ed4f8be019 | |||
| 883592d029 | |||
| a6dcaf1c7e | |||
| 88727fb590 | |||
| c9f5224c42 | |||
| 7cb5c02a9b | |||
| c1aa3cf491 | |||
| f7eb75c57c | |||
| 004cc4910d | |||
| ed1cceed8c | |||
| 9fe9f061f8 | |||
| 837a1fb981 | |||
| 1f43b4e050 | |||
| 83bbc8a1bc | |||
| 896bdb6ee8 | |||
| 5407c26e25 | |||
| 4fddaba9c5 | |||
| d2f384b6eb | |||
| 25f0f30aaf | |||
| a57b037a91 | |||
| 5644231f9a | |||
| eea98e6d76 | |||
| 967455194c | |||
| 79dabf3efb | |||
| 1336f5b95c | |||
| 31486a31c1 | |||
| aa3fc332ba | |||
| 62c57b87f2 | |||
| f600261546 | |||
| d7ca04bdfb | |||
| 5433652c70 | |||
| b25f014dbd | |||
| d69a46f211 | |||
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa |
4
PLAN.md
4
PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
|
|||||||
- `bee tui` can rerun the audit manually
|
- `bee tui` can rerun the audit manually
|
||||||
- `bee tui` can export the latest audit JSON to removable media
|
- `bee tui` can export the latest audit JSON to removable media
|
||||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||||
|
|
||||||
### 2.6 — Vendor utilities and optional assets
|
### 2.6 — Vendor utilities and optional assets
|
||||||
|
|||||||
22
audit/Makefile
Normal file
22
audit/Makefile
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
LISTEN ?= :8080
|
||||||
|
AUDIT_PATH ?=
|
||||||
|
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||||
|
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||||
|
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||||
|
|
||||||
|
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||||
|
ifneq ($(AUDIT_PATH),)
|
||||||
|
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||||
|
endif
|
||||||
|
|
||||||
|
.PHONY: run build test
|
||||||
|
|
||||||
|
run:
|
||||||
|
mkdir -p $(EXPORT_DIR)
|
||||||
|
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||||
|
|
||||||
|
build:
|
||||||
|
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||||
|
|
||||||
|
test:
|
||||||
|
go test ./...
|
||||||
@@ -1,30 +1,49 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
"bee/audit/internal/tui"
|
|
||||||
"bee/audit/internal/webui"
|
"bee/audit/internal/webui"
|
||||||
)
|
)
|
||||||
|
|
||||||
var Version = "dev"
|
var Version = "dev"
|
||||||
|
|
||||||
|
func buildLabel() string {
|
||||||
|
label := strings.TrimSpace(Version)
|
||||||
|
if label == "" {
|
||||||
|
return "dev"
|
||||||
|
}
|
||||||
|
return label
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
|
|
||||||
func run(args []string, stdout, stderr io.Writer) int {
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
Level: slog.LevelInfo,
|
Level: slog.LevelInfo,
|
||||||
})))
|
})))
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("fatal panic",
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
exitCode = 1
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
printRootUsage(stderr)
|
printRootUsage(stderr)
|
||||||
@@ -40,8 +59,6 @@ func run(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
case "audit":
|
case "audit":
|
||||||
return runAudit(args[1:], stdout, stderr)
|
return runAudit(args[1:], stdout, stderr)
|
||||||
case "tui":
|
|
||||||
return runTUI(args[1:], stdout, stderr)
|
|
||||||
case "export":
|
case "export":
|
||||||
return runExport(args[1:], stdout, stderr)
|
return runExport(args[1:], stdout, stderr)
|
||||||
case "preflight":
|
case "preflight":
|
||||||
@@ -52,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -66,11 +85,11 @@ func printRootUsage(w io.Writer) {
|
|||||||
fmt.Fprintln(w, `bee commands:
|
fmt.Fprintln(w, `bee commands:
|
||||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||||
bee preflight --output stdout|file:<path>
|
bee preflight --output stdout|file:<path>
|
||||||
bee tui --runtime auto|local|livecd
|
|
||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -79,8 +98,6 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
switch args[0] {
|
switch args[0] {
|
||||||
case "audit":
|
case "audit":
|
||||||
return runAudit([]string{"--help"}, stdout, stdout)
|
return runAudit([]string{"--help"}, stdout, stdout)
|
||||||
case "tui":
|
|
||||||
return runTUI([]string{"--help"}, stdout, stdout)
|
|
||||||
case "export":
|
case "export":
|
||||||
return runExport([]string{"--help"}, stdout, stdout)
|
return runExport([]string{"--help"}, stdout, stdout)
|
||||||
case "preflight":
|
case "preflight":
|
||||||
@@ -91,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -145,43 +164,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func runTUI(args []string, stdout, stderr io.Writer) int {
|
|
||||||
fs := flag.NewFlagSet("tui", flag.ContinueOnError)
|
|
||||||
fs.SetOutput(stderr)
|
|
||||||
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
|
||||||
fs.Usage = func() {
|
|
||||||
fmt.Fprintln(stderr, "usage: bee tui [--runtime auto|local|livecd]")
|
|
||||||
fs.PrintDefaults()
|
|
||||||
}
|
|
||||||
if err := fs.Parse(args); err != nil {
|
|
||||||
if err == flag.ErrHelp {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return 2
|
|
||||||
}
|
|
||||||
if fs.NArg() != 0 {
|
|
||||||
fs.Usage()
|
|
||||||
return 2
|
|
||||||
}
|
|
||||||
|
|
||||||
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
|
||||||
if err != nil {
|
|
||||||
slog.Error("resolve runtime", "err", err)
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{
|
|
||||||
Level: slog.LevelInfo,
|
|
||||||
})))
|
|
||||||
|
|
||||||
application := app.New(platform.New())
|
|
||||||
if err := tui.Run(application, runtimeInfo.Mode); err != nil {
|
|
||||||
slog.Error("run tui", "err", err)
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
@@ -314,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||||
fs.Usage = func() {
|
fs.Usage = func() {
|
||||||
@@ -333,10 +315,19 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
||||||
|
|
||||||
|
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("resolve runtime for web", "err", err)
|
||||||
|
}
|
||||||
|
|
||||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||||
Title: *title,
|
Title: *title,
|
||||||
AuditPath: *auditPath,
|
BuildLabel: buildLabel(),
|
||||||
ExportDir: *exportDir,
|
AuditPath: *auditPath,
|
||||||
|
ExportDir: *exportDir,
|
||||||
|
App: app.New(platform.New()),
|
||||||
|
RuntimeMode: runtimeInfo.Mode,
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
slog.Error("run web", "err", err)
|
slog.Error("run web", "err", err)
|
||||||
return 1
|
return 1
|
||||||
@@ -357,6 +348,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||||
|
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||||
if err := fs.Parse(args[1:]); err != nil {
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
if err == flag.ErrHelp {
|
if err == flag.ErrHelp {
|
||||||
return 0
|
return 0
|
||||||
@@ -371,7 +363,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
target := args[0]
|
target := args[0]
|
||||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||||
return 2
|
return 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -380,19 +372,25 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive string
|
archive string
|
||||||
err error
|
err error
|
||||||
)
|
)
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
archive, err = application.RunNvidiaAcceptancePack("")
|
level := *diagLevel
|
||||||
|
if level > 0 {
|
||||||
|
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||||
|
} else {
|
||||||
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePack("")
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePack("")
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
dur = 60
|
dur = 60
|
||||||
}
|
}
|
||||||
archive, err = application.RunCPUAcceptancePack("", dur)
|
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Error("run sat", "target", target, "err", err)
|
slog.Error("run sat", "target", target, "err", err)
|
||||||
@@ -401,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
slog.Info("sat archive written", "target", target, "path", archive)
|
slog.Info("sat archive written", "target", target, "path", archive)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||||
|
if len(args) == 0 {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||||
|
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
target := args[0]
|
||||||
|
if target != "nvidia" {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||||
|
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||||
|
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||||
|
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||||
|
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||||
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
|
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: *profile,
|
||||||
|
SizeMB: *sizeMB,
|
||||||
|
GPUIndices: includeIndices,
|
||||||
|
ExcludeGPUIndices: excludeIndices,
|
||||||
|
RunNCCL: !*skipNCCL,
|
||||||
|
}, logLine)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run benchmark", "target", target, "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
part = strings.TrimSpace(part)
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(part)
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||||
|
}
|
||||||
|
indices = append(indices, value)
|
||||||
|
}
|
||||||
|
return indices, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunVersion(t *testing.T) {
|
func TestRunVersion(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
old := Version
|
old := Version
|
||||||
Version = "test-version"
|
Version = "test-version"
|
||||||
t.Cleanup(func() { Version = old })
|
t.Cleanup(func() { Version = old })
|
||||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||||
|
old := Version
|
||||||
|
Version = "1.2.3"
|
||||||
|
t.Cleanup(func() { Version = old })
|
||||||
|
|
||||||
|
if got := buildLabel(); got != "1.2.3" {
|
||||||
|
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunExportRequiresTarget(t *testing.T) {
|
func TestRunExportRequiresTarget(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
36
audit/go.mod
36
audit/go.mod
@@ -1,28 +1,26 @@
|
|||||||
module bee/audit
|
module bee/audit
|
||||||
|
|
||||||
go 1.24.0
|
go 1.25.0
|
||||||
|
|
||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require github.com/charmbracelet/bubbletea v1.3.4
|
require (
|
||||||
require github.com/charmbracelet/lipgloss v1.0.0
|
github.com/go-analyze/charts v0.5.26
|
||||||
require reanimator/chart v0.0.0
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/charmbracelet/lipgloss v1.0.0 // promoted to direct — used for TUI colors
|
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||||
github.com/charmbracelet/x/ansi v0.8.0 // indirect
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||||
github.com/charmbracelet/x/term v0.2.1 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
|
||||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/mattn/go-runewidth v0.0.16 // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
golang.org/x/image v0.24.0 // indirect
|
||||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
github.com/muesli/termenv v0.15.2 // indirect
|
modernc.org/libc v1.70.0 // indirect
|
||||||
github.com/rivo/uniseg v0.4.7 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
golang.org/x/sync v0.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
golang.org/x/sys v0.30.0 // indirect
|
modernc.org/sqlite v1.48.0 // indirect
|
||||||
golang.org/x/text v0.3.8 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
68
audit/go.sum
68
audit/go.sum
@@ -1,37 +1,37 @@
|
|||||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/charmbracelet/bubbletea v1.3.4 h1:kCg7B+jSCFPLYRA52SDZjr51kG/fMUEoPoZrkaDHyoI=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/charmbracelet/bubbletea v1.3.4/go.mod h1:dtcUCyCGEX3g9tosuYiut3MXgY/Jsv9nKVdibKKRRXo=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
|
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||||
github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
|
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||||
github.com/charmbracelet/x/ansi v0.8.0 h1:9GTq3xq9caJW8ZrBTe0LIe2fvfLR/bYXKTx2llXn7xE=
|
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||||
github.com/charmbracelet/x/ansi v0.8.0/go.mod h1:wdYl/ONOLHLIVmQaxbIYEC/cRKOQyjTkowiI4blgS9Q=
|
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||||
github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||||
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
|
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
|
||||||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
|
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||||
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
|
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
|
||||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
|
||||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
|
||||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
|
||||||
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
|
||||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||||
|
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||||
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
|||||||
@@ -19,26 +19,30 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
|
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
network networkManager
|
network networkManager
|
||||||
services serviceManager
|
services serviceManager
|
||||||
exports exportManager
|
exports exportManager
|
||||||
tools toolManager
|
tools toolManager
|
||||||
sat satRunner
|
sat satRunner
|
||||||
runtime runtimeChecker
|
runtime runtimeChecker
|
||||||
|
installer installer
|
||||||
|
// StatusDB is the unified component health store (nil if unavailable).
|
||||||
|
StatusDB *ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type ActionResult struct {
|
type ActionResult struct {
|
||||||
@@ -52,10 +56,15 @@ type networkManager interface {
|
|||||||
DHCPOne(iface string) (string, error)
|
DHCPOne(iface string) (string, error)
|
||||||
DHCPAll() (string, error)
|
DHCPAll() (string, error)
|
||||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||||
|
SetInterfaceState(iface string, up bool) error
|
||||||
|
GetInterfaceState(iface string) (bool, error)
|
||||||
|
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||||
|
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type serviceManager interface {
|
type serviceManager interface {
|
||||||
ListBeeServices() ([]string, error)
|
ListBeeServices() ([]string, error)
|
||||||
|
ServiceState(name string) string
|
||||||
ServiceStatus(name string) (string, error)
|
ServiceStatus(name string) (string, error)
|
||||||
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
||||||
}
|
}
|
||||||
@@ -70,17 +79,64 @@ type toolManager interface {
|
|||||||
CheckTools(names []string) []platform.ToolStatus
|
CheckTools(names []string) []platform.ToolStatus
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type installer interface {
|
||||||
|
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||||
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
|
IsLiveMediaInRAM() bool
|
||||||
|
LiveBootSource() platform.LiveBootSource
|
||||||
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
|
}
|
||||||
|
|
||||||
|
type GPUPresenceResult struct {
|
||||||
|
Nvidia bool
|
||||||
|
AMD bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||||
|
vendor := a.sat.DetectGPUVendor()
|
||||||
|
return GPUPresenceResult{
|
||||||
|
Nvidia: vendor == "nvidia",
|
||||||
|
AMD: vendor == "amd",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) IsLiveMediaInRAM() bool {
|
||||||
|
return a.installer.IsLiveMediaInRAM()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||||
|
return a.installer.LiveBootSource()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -89,14 +145,39 @@ type runtimeChecker interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(platform *platform.System) *App {
|
func New(platform *platform.System) *App {
|
||||||
return &App{
|
a := &App{
|
||||||
network: platform,
|
network: platform,
|
||||||
services: platform,
|
services: platform,
|
||||||
exports: platform,
|
exports: platform,
|
||||||
tools: platform,
|
tools: platform,
|
||||||
sat: platform,
|
sat: platform,
|
||||||
runtime: platform,
|
runtime: platform,
|
||||||
|
installer: platform,
|
||||||
}
|
}
|
||||||
|
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||||
|
a.StatusDB = db
|
||||||
|
}
|
||||||
|
return a
|
||||||
|
}
|
||||||
|
|
||||||
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||||
|
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||||
|
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||||
|
snap, err := readAuditSnapshot(auditJSON)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||||
|
return json.MarshalIndent(snap, "", " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||||
|
var snap schema.HardwareIngestRequest
|
||||||
|
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||||
|
return schema.HardwareIngestRequest{}, err
|
||||||
|
}
|
||||||
|
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||||
|
return snap, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||||
@@ -106,7 +187,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -121,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -149,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -222,6 +297,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||||
|
data = normalized
|
||||||
|
}
|
||||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -231,8 +309,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
|
|
||||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
path, err := a.ExportLatestAudit(target)
|
path, err := a.ExportLatestAudit(target)
|
||||||
body := "Audit exported."
|
body := "Audit export failed."
|
||||||
if path != "" {
|
if err == nil {
|
||||||
|
body = "Audit exported."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
body = "Audit exported to " + path
|
body = "Audit exported to " + path
|
||||||
}
|
}
|
||||||
return ActionResult{Title: "Export audit", Body: body}, err
|
return ActionResult{Title: "Export audit", Body: body}, err
|
||||||
@@ -249,8 +330,11 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
|||||||
|
|
||||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
path, err := a.ExportSupportBundle(target)
|
path, err := a.ExportSupportBundle(target)
|
||||||
body := "Support bundle exported. USB target unmounted and safe to remove."
|
body := "Support bundle export failed."
|
||||||
if path != "" {
|
if err == nil {
|
||||||
|
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||||
}
|
}
|
||||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||||
@@ -286,6 +370,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
|||||||
return a.network.SetStaticIPv4(cfg)
|
return a.network.SetStaticIPv4(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||||
|
return a.network.SetInterfaceState(iface, up)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return a.network.GetInterfaceState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return a.network.CaptureNetworkSnapshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||||
|
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||||
body, err := a.network.SetStaticIPv4(cfg)
|
body, err := a.network.SetStaticIPv4(cfg)
|
||||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||||
@@ -342,6 +442,10 @@ func (a *App) ListBeeServices() ([]string, error) {
|
|||||||
return a.services.ListBeeServices()
|
return a.services.ListBeeServices()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceState(name string) string {
|
||||||
|
return a.services.ServiceState(name)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) ServiceStatus(name string) (string, error) {
|
func (a *App) ServiceStatus(name string) (string, error) {
|
||||||
return a.services.ServiceStatus(name)
|
return a.services.ServiceStatus(name)
|
||||||
}
|
}
|
||||||
@@ -397,15 +501,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
|||||||
return ActionResult{Title: "Audit log tail", Body: body}
|
return ActionResult{Title: "Audit log tail", Body: body}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunNvidiaAcceptancePack(baseDir)
|
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||||
body := "Archive written."
|
body := "Archive written."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Archive written to " + path
|
body = "Archive written to " + path
|
||||||
@@ -417,58 +521,120 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||||
body := "Archive written."
|
body := "Archive written."
|
||||||
if path != "" {
|
if path != "" {
|
||||||
body = "Archive written to " + path
|
body = "Archive written to " + path
|
||||||
}
|
}
|
||||||
// Include terminal chart if available (runDir = archive path without .tar.gz).
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
if path != "" {
|
|
||||||
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
|
|
||||||
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
|
|
||||||
body += "\n\n" + string(chart)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBenchmarkBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunMemoryAcceptancePack(baseDir)
|
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
|
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunStorageAcceptancePack(baseDir)
|
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -480,18 +646,63 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
return a.sat.ListAMDGPUs()
|
return a.sat.ListAMDGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunAMDAcceptancePack(baseDir)
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
path, err := a.RunAMDAcceptancePack(baseDir)
|
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -499,13 +710,29 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
|||||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
|
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||||
|
body := "Results: " + path
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
body := formatFanStressResult(path)
|
body := formatFanStressResult(path)
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
}
|
}
|
||||||
return ActionResult{Title: "Fan Stress Test", Body: body}, err
|
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||||
@@ -576,6 +803,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
|||||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||||
}
|
}
|
||||||
|
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||||
|
|
||||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||||
var body strings.Builder
|
var body strings.Builder
|
||||||
@@ -610,6 +838,7 @@ func (a *App) MainBanner() string {
|
|||||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||||
|
|
||||||
var lines []string
|
var lines []string
|
||||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||||
@@ -704,6 +933,12 @@ func latestSATSummaries() []string {
|
|||||||
prefix string
|
prefix string
|
||||||
}{
|
}{
|
||||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
{label: "Memory SAT", prefix: "memory-"},
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
{label: "Storage SAT", prefix: "storage-"},
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
{label: "CPU SAT", prefix: "cpu-"},
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
@@ -994,3 +1229,70 @@ func firstNonEmpty(values ...string) string {
|
|||||||
}
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||||
|
return a.installer.ListInstallDisks()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||||
|
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSATDetail(raw string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
kv := parseKeyValueSummary(raw)
|
||||||
|
|
||||||
|
if t, ok := kv["run_at_utc"]; ok {
|
||||||
|
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
var stepKeys []string
|
||||||
|
seenStep := map[string]bool{}
|
||||||
|
for _, line := range lines {
|
||||||
|
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||||
|
key := line[:idx]
|
||||||
|
if !seenStep[key] && key != "overall" {
|
||||||
|
seenStep[key] = true
|
||||||
|
stepKeys = append(stepKeys, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, key := range stepKeys {
|
||||||
|
status := kv[key+"_status"]
|
||||||
|
display := cleanSummaryKey(key)
|
||||||
|
switch status {
|
||||||
|
case "OK":
|
||||||
|
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||||
|
case "FAILED":
|
||||||
|
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||||
|
case "UNSUPPORTED":
|
||||||
|
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(&b, "? %s\n", display)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if overall, ok := kv["overall_status"]; ok {
|
||||||
|
ok2 := kv["job_ok"]
|
||||||
|
failed := kv["job_failed"]
|
||||||
|
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanSummaryKey strips a leading "<digits>-" ordering prefix from key.
// Keys without such a prefix (no dash, a leading dash, or a non-numeric
// prefix before the first dash) are returned unchanged.
func cleanSummaryKey(key string) string {
	// Dropping the leading ASCII digits leaves the remainder starting at the
	// first non-digit byte; a numeric prefix exists only if at least one digit
	// was removed and that remainder begins with the dash separator.
	rest := strings.TrimLeft(key, "0123456789")
	if len(rest) == len(key) || !strings.HasPrefix(rest, "-") {
		return key
	}
	return rest[1:]
}
|
||||||
|
|||||||
@@ -43,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
|||||||
return f.setStaticIPv4Fn(cfg)
|
return f.setStaticIPv4Fn(cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||||
|
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||||
|
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return platform.NetworkSnapshot{}, nil
|
||||||
|
}
|
||||||
|
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||||
|
|
||||||
type fakeServices struct {
|
type fakeServices struct {
|
||||||
serviceStatusFn func(string) (string, error)
|
serviceStatusFn func(string) (string, error)
|
||||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||||
@@ -52,6 +59,10 @@ func (f fakeServices) ListBeeServices() ([]string, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeServices) ServiceState(name string) string {
|
||||||
|
return "active"
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
||||||
return f.serviceStatusFn(name)
|
return f.serviceStatusFn(name)
|
||||||
}
|
}
|
||||||
@@ -109,21 +120,77 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
detectVendorFn func() string
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
|
runMemoryFn func(string) (string, error)
|
||||||
|
runStorageFn func(string) (string, error)
|
||||||
|
runCPUFn func(string, int) (string, error)
|
||||||
|
detectVendorFn func() string
|
||||||
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBenchmarkFn != nil {
|
||||||
|
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaComputeFn != nil {
|
||||||
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerFn != nil {
|
||||||
|
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPulseFn != nil {
|
||||||
|
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBandwidthFn != nil {
|
||||||
|
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaStressFn != nil {
|
||||||
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
|
}
|
||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,15 +201,15 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||||
if f.runCPUFn != nil {
|
if f.runCPUFn != nil {
|
||||||
return f.runCPUFn(baseDir, durationSec)
|
return f.runCPUFn(baseDir, durationSec)
|
||||||
}
|
}
|
||||||
@@ -163,17 +230,43 @@ func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
if f.runAMDPackFn != nil {
|
if f.runAMDPackFn != nil {
|
||||||
return f.runAMDPackFn(baseDir)
|
return f.runAMDPackFn(baseDir)
|
||||||
}
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -470,6 +563,41 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldExportDir := DefaultExportDir
|
||||||
|
DefaultExportDir = tmp
|
||||||
|
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||||
|
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.json: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("write bee-audit.log: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
a := &App{
|
||||||
|
exports: fakeExports{
|
||||||
|
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||||
|
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected export error")
|
||||||
|
}
|
||||||
|
if contains(result.Body, "exported to") {
|
||||||
|
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||||
|
}
|
||||||
|
if result.Body != "Support bundle export failed." {
|
||||||
|
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -531,13 +659,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
|
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if _, err := a.RunMemoryAcceptancePack(""); err != nil {
|
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if _, err := a.RunStorageAcceptancePack(""); err != nil {
|
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -580,13 +708,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
|
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||||
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
|
|
||||||
|
raw := `{
|
||||||
|
"collected_at": "2026-03-15T10:00:00Z",
|
||||||
|
"hardware": {
|
||||||
|
"board": {"serial_number": "SRV123"},
|
||||||
|
"storage": [
|
||||||
|
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||||
|
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||||
|
],
|
||||||
|
"pcie_devices": [
|
||||||
|
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||||
|
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}`
|
||||||
|
|
||||||
|
got, err := ApplySATOverlay([]byte(raw))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||||
|
}
|
||||||
|
text := string(got)
|
||||||
|
if contains(text, "Virtual HDisk0") {
|
||||||
|
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||||
|
}
|
||||||
|
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||||
|
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||||
|
}
|
||||||
|
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||||
|
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
exportDir := filepath.Join(tmp, "export")
|
exportDir := filepath.Join(tmp, "export")
|
||||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
@@ -618,6 +783,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
|
|
||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
|
var auditJSON string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -627,6 +793,33 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
t.Fatalf("read tar entry: %v", err)
|
t.Fatalf("read tar entry: %v", err)
|
||||||
}
|
}
|
||||||
names = append(names, hdr.Name)
|
names = append(names, hdr.Name)
|
||||||
|
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read audit entry: %v", err)
|
||||||
|
}
|
||||||
|
auditJSON = string(body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, want := range []string{
|
||||||
|
"/system/ip-link.txt",
|
||||||
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/ethtool-info.txt",
|
||||||
|
"/system/ethtool-link.txt",
|
||||||
|
"/system/ethtool-module.txt",
|
||||||
|
"/system/mstflint-query.txt",
|
||||||
|
} {
|
||||||
|
var found bool
|
||||||
|
for _, name := range names {
|
||||||
|
if contains(name, want) {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var foundRaw bool
|
var foundRaw bool
|
||||||
@@ -641,6 +834,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !foundRaw {
|
if !foundRaw {
|
||||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||||
}
|
}
|
||||||
|
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||||
|
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||||
|
}
|
||||||
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
@@ -654,6 +853,10 @@ func TestMainBanner(t *testing.T) {
|
|||||||
product := "PowerEdge R760"
|
product := "PowerEdge R760"
|
||||||
cpuModel := "Intel Xeon Gold 6430"
|
cpuModel := "Intel Xeon Gold 6430"
|
||||||
memoryType := "DDR5"
|
memoryType := "DDR5"
|
||||||
|
memorySerialA := "DIMM-A"
|
||||||
|
memorySerialB := "DIMM-B"
|
||||||
|
storageSerialA := "DISK-A"
|
||||||
|
storageSerialB := "DISK-B"
|
||||||
gpuClass := "VideoController"
|
gpuClass := "VideoController"
|
||||||
gpuModel := "NVIDIA H100"
|
gpuModel := "NVIDIA H100"
|
||||||
|
|
||||||
@@ -669,12 +872,12 @@ func TestMainBanner(t *testing.T) {
|
|||||||
{Model: &cpuModel},
|
{Model: &cpuModel},
|
||||||
},
|
},
|
||||||
Memory: []schema.HardwareMemory{
|
Memory: []schema.HardwareMemory{
|
||||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||||
},
|
},
|
||||||
Storage: []schema.HardwareStorage{
|
Storage: []schema.HardwareStorage{
|
||||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||||
},
|
},
|
||||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||||
|
|||||||
48
audit/internal/app/atomic_write.go
Normal file
48
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
// atomicWriteFile writes data to path so that readers see either the previous
// content or the complete new content, never a partial file.
//
// The data is written to a uniquely named temporary file in the same
// directory as path, flushed to disk, and then renamed over path. A unique
// temp name is used instead of a fixed "<path>.tmp" so two writers targeting
// the same path cannot truncate or rename each other's half-written file.
//
// The parent directory is created (mode 0755) if it does not exist. perm is
// applied to the temp file with Chmod before the rename, so the final mode is
// exactly perm rather than perm filtered through the process umask.
//
// On any failure the temp file is removed and a wrapped error is returned.
// Syncing the parent directory after the rename is best-effort: a failure
// there is ignored because the new content is already in place.
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("mkdir %s: %w", dir, err)
	}

	f, err := os.CreateTemp(dir, filepath.Base(path)+".tmp-*")
	if err != nil {
		return fmt.Errorf("create temp in %s: %w", dir, err)
	}
	tmpPath := f.Name()

	// closed/committed let the deferred cleanup close the file exactly once
	// and delete the temp file only when it was not renamed into place.
	closed, committed := false, false
	defer func() {
		if !closed {
			_ = f.Close()
		}
		if !committed {
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err := f.Write(data); err != nil {
		return fmt.Errorf("write temp %s: %w", tmpPath, err)
	}
	if err := f.Chmod(perm); err != nil {
		return fmt.Errorf("chmod temp %s: %w", tmpPath, err)
	}
	if err := f.Sync(); err != nil {
		return fmt.Errorf("sync temp %s: %w", tmpPath, err)
	}
	closed = true
	if err := f.Close(); err != nil {
		return fmt.Errorf("close temp %s: %w", tmpPath, err)
	}
	if err := os.Rename(tmpPath, path); err != nil {
		return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
	}
	committed = true

	// Best-effort: persist the directory entry created by the rename.
	if d, err := os.Open(dir); err == nil {
		_ = d.Sync()
		_ = d.Close()
	}
	return nil
}
|
||||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("seed file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("atomicWriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read final: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "new\n" {
|
||||||
|
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||||
|
a := &App{
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
return schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: exportDir,
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := a.RunRuntimePreflight("file:" + path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||||
|
}
|
||||||
|
if got != path {
|
||||||
|
t.Fatalf("path=%q want %q", got, path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read runtime file: %v", err)
|
||||||
|
}
|
||||||
|
var health schema.RuntimeHealth
|
||||||
|
if err := json.Unmarshal(raw, &health); err != nil {
|
||||||
|
t.Fatalf("json unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if health.Status != "OK" {
|
||||||
|
t.Fatalf("status=%q want OK", health.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
268
audit/internal/app/component_status_db.go
Normal file
268
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||||
|
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||||
|
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||||
|
// the component stays at the highest observed severity until explicitly reset.
|
||||||
|
type ComponentStatusDB struct {
|
||||||
|
path string
|
||||||
|
mu sync.Mutex
|
||||||
|
records map[string]*ComponentStatusRecord
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
|
type ComponentStatusRecord struct {
|
||||||
|
ComponentKey string `json:"component_key"`
|
||||||
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
|
History []ComponentStatusEntry `json:"history"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusEntry is a single observation stored in a component's
// history.
type ComponentStatusEntry struct {
	// At is when the observation was recorded.
	At time.Time `json:"at"`
	// Status is the status reported by this observation.
	Status string `json:"status"`
	// Source is a short label naming the reporter, e.g. "sat:nvidia",
	// "sat:memory" or "watchdog:kmsg".
	Source string `json:"source"`
	// Detail is optional free-form context; it is omitted from JSON when empty.
	Detail string `json:"detail,omitempty"`
}
|
||||||
|
|
||||||
|
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||||
|
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||||
|
db := &ComponentStatusDB{
|
||||||
|
path: path,
|
||||||
|
records: make(map[string]*ComponentStatusRecord),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(data) > 0 {
|
||||||
|
var records []ComponentStatusRecord
|
||||||
|
if err := json.Unmarshal(data, &records); err == nil {
|
||||||
|
for i := range records {
|
||||||
|
db.records[records[i].ComponentKey] = &records[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return db, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record writes one observation for the given component key.
|
||||||
|
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||||
|
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||||
|
// OK never downgrades an existing Warning or Critical status.
|
||||||
|
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||||
|
if db == nil || strings.TrimSpace(key) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
rec, exists := db.records[key]
|
||||||
|
if !exists {
|
||||||
|
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||||
|
db.records[key] = rec
|
||||||
|
}
|
||||||
|
rec.LastCheckedAt = now
|
||||||
|
|
||||||
|
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||||
|
rec.History = append(rec.History, entry)
|
||||||
|
|
||||||
|
// Status merge: OK never downgrades Warning/Critical.
|
||||||
|
newSev := componentSeverity(status)
|
||||||
|
curSev := componentSeverity(rec.Status)
|
||||||
|
if newSev > curSev {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
rec.ErrorSummary = detail
|
||||||
|
} else if rec.Status == "" {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = db.saveLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get returns the current record for a component key.
|
||||||
|
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||||
|
if db == nil {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
r, ok := db.records[key]
|
||||||
|
if !ok {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
return *r, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// All returns a snapshot of all records.
|
||||||
|
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||||
|
if db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
out = append(out, *r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *ComponentStatusDB) saveLocked() error {
|
||||||
|
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
records = append(records, *r)
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(records, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(db.path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// componentSeverity ranks a status string so that a higher number wins when
// statuses are merged: Critical=3, Warning=2, OK=1, anything else (including
// "Unknown" and the empty string) = 0. Surrounding whitespace is ignored;
// matching is case-sensitive.
func componentSeverity(status string) int {
	name := strings.TrimSpace(status)
	if name == "Critical" {
		return 3
	}
	if name == "Warning" {
		return 2
	}
	if name == "OK" {
		return 1
	}
	return 0
}
|
||||||
|
|
||||||
|
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||||
|
// and writes component status records to db for the given SAT target.
|
||||||
|
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||||
|
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||||
|
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
archivePath = extractArchivePath(archivePath)
|
||||||
|
if archivePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
if overall == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
source := "sat:" + target
|
||||||
|
dbStatus := satStatusToDBStatus(overall)
|
||||||
|
|
||||||
|
// Map SAT target to component keys.
|
||||||
|
switch target {
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||||
|
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "memory", "memory-stress", "sat-stress":
|
||||||
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "cpu", "platform-stress":
|
||||||
|
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "storage":
|
||||||
|
// Try to record per-device if available in summary.
|
||||||
|
recordedAny := false
|
||||||
|
for key, val := range kv {
|
||||||
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
base := strings.TrimSuffix(key, "_status")
|
||||||
|
idx := strings.Index(base, "_")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devName := base[:idx]
|
||||||
|
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||||
|
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||||
|
recordedAny = true
|
||||||
|
}
|
||||||
|
if !recordedAny {
|
||||||
|
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// satStatusToDBStatus translates an upper-cased SAT status into the status
// vocabulary of ComponentStatusDB: "OK" stays "OK", "FAILED" becomes
// "Warning", and everything else (PARTIAL, UNSUPPORTED, unrecognised values)
// becomes "Unknown".
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	return "Unknown"
}
|
||||||
|
|
||||||
|
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||||
|
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||||
|
func ExtractArchivePath(s string) string {
|
||||||
|
return extractArchivePath(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||||
|
// file located in the run directory alongside archivePath.
|
||||||
|
// Returns "" if the file cannot be read.
|
||||||
|
func ReadSATOverallStatus(archivePath string) string {
|
||||||
|
if strings.TrimSpace(archivePath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractArchivePath trims s and, when the result ends in ".tar.gz", returns
// only its last whitespace-separated word (so "Archive written to /p/x.tar.gz"
// becomes "/p/x.tar.gz"). Any other input is returned trimmed but otherwise
// unchanged.
func extractArchivePath(s string) string {
	trimmed := strings.TrimSpace(s)
	if !strings.HasSuffix(trimmed, ".tar.gz") {
		return trimmed
	}
	// trimmed is non-blank here, so Fields yields at least one word.
	words := strings.Fields(trimmed)
	return words[len(words)-1]
}
|
||||||
|
|
||||||
|
// parseSATKV parses newline-separated "key=value" text into a map. Each line
// is split at its first "="; keys and values are whitespace-trimmed. Lines
// without "=" are skipped, and a later duplicate key overwrites an earlier one.
func parseSATKV(raw string) map[string]string {
	fields := make(map[string]string)
	for _, line := range strings.Split(raw, "\n") {
		pair := strings.SplitN(strings.TrimSpace(line), "=", 2)
		if len(pair) != 2 {
			continue
		}
		fields[strings.TrimSpace(pair[0])] = strings.TrimSpace(pair[1])
	}
	return fields
}
|
||||||
@@ -1,387 +0,0 @@
|
|||||||
package app
|
|
||||||
|
|
||||||
import (
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"bee/audit/internal/schema"
|
|
||||||
)
|
|
||||||
|
|
||||||
// ComponentRow describes a single component line of the hardware panel.
type ComponentRow struct {
	// Key is the short component label: "CPU", "MEM", "GPU", "DISK" or "PSU".
	Key string
	// Status is the latest test outcome: "PASS", "FAIL", "CANCEL" or "N/A".
	Status string
	// Detail is a compact one-line description of the component.
	Detail string
}
|
|
||||||
|
|
||||||
// HardwarePanelData holds everything the TUI right panel needs.
|
|
||||||
type HardwarePanelData struct {
|
|
||||||
Header []string
|
|
||||||
Rows []ComponentRow
|
|
||||||
}
|
|
||||||
|
|
||||||
// LoadHardwarePanel reads the latest audit JSON and SAT summaries.
|
|
||||||
// Returns empty panel if no audit data exists yet.
|
|
||||||
func (a *App) LoadHardwarePanel() HardwarePanelData {
|
|
||||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
|
||||||
if err != nil {
|
|
||||||
return HardwarePanelData{Header: []string{"No audit data — run audit first."}}
|
|
||||||
}
|
|
||||||
var snap schema.HardwareIngestRequest
|
|
||||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
|
||||||
return HardwarePanelData{Header: []string{"Audit data unreadable."}}
|
|
||||||
}
|
|
||||||
|
|
||||||
statuses := satStatuses()
|
|
||||||
|
|
||||||
var header []string
|
|
||||||
if sys := formatSystemLine(snap.Hardware.Board); sys != "" {
|
|
||||||
header = append(header, sys)
|
|
||||||
}
|
|
||||||
for _, fw := range snap.Hardware.Firmware {
|
|
||||||
if fw.DeviceName == "BIOS" && fw.Version != "" {
|
|
||||||
header = append(header, "BIOS: "+fw.Version)
|
|
||||||
}
|
|
||||||
if fw.DeviceName == "BMC" && fw.Version != "" {
|
|
||||||
header = append(header, "BMC: "+fw.Version)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
|
|
||||||
header = append(header, ip)
|
|
||||||
}
|
|
||||||
|
|
||||||
var rows []ComponentRow
|
|
||||||
|
|
||||||
if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" {
|
|
||||||
rows = append(rows, ComponentRow{
|
|
||||||
Key: "CPU",
|
|
||||||
Status: statuses["cpu"],
|
|
||||||
Detail: strings.TrimPrefix(cpu, "CPU: "),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if mem := formatMemoryLine(snap.Hardware.Memory); mem != "" {
|
|
||||||
rows = append(rows, ComponentRow{
|
|
||||||
Key: "MEM",
|
|
||||||
Status: statuses["memory"],
|
|
||||||
Detail: strings.TrimPrefix(mem, "Memory: "),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if gpu := formatGPULine(snap.Hardware.PCIeDevices); gpu != "" {
|
|
||||||
rows = append(rows, ComponentRow{
|
|
||||||
Key: "GPU",
|
|
||||||
Status: statuses["gpu"],
|
|
||||||
Detail: strings.TrimPrefix(gpu, "GPU: "),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if disk := formatStorageLine(snap.Hardware.Storage); disk != "" {
|
|
||||||
rows = append(rows, ComponentRow{
|
|
||||||
Key: "DISK",
|
|
||||||
Status: statuses["storage"],
|
|
||||||
Detail: strings.TrimPrefix(disk, "Storage: "),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
if psu := formatPSULine(snap.Hardware.PowerSupplies); psu != "" {
|
|
||||||
rows = append(rows, ComponentRow{
|
|
||||||
Key: "PSU",
|
|
||||||
Status: "N/A",
|
|
||||||
Detail: psu,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
return HardwarePanelData{Header: header, Rows: rows}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ComponentDetailResult returns detail text for a component shown in the panel.
|
|
||||||
func (a *App) ComponentDetailResult(key string) ActionResult {
|
|
||||||
switch key {
|
|
||||||
case "CPU":
|
|
||||||
return a.cpuDetailResult(false)
|
|
||||||
case "MEM":
|
|
||||||
return a.satDetailResult("memory", "memory-", "MEM detail")
|
|
||||||
case "GPU":
|
|
||||||
// Prefer whichever GPU SAT was run most recently.
|
|
||||||
nv, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-nvidia-*/summary.txt"))
|
|
||||||
am, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-amd-*/summary.txt"))
|
|
||||||
sort.Strings(nv)
|
|
||||||
sort.Strings(am)
|
|
||||||
latestNV := ""
|
|
||||||
if len(nv) > 0 {
|
|
||||||
latestNV = nv[len(nv)-1]
|
|
||||||
}
|
|
||||||
latestAM := ""
|
|
||||||
if len(am) > 0 {
|
|
||||||
latestAM = am[len(am)-1]
|
|
||||||
}
|
|
||||||
if latestAM > latestNV {
|
|
||||||
return a.satDetailResult("gpu", "gpu-amd-", "GPU detail")
|
|
||||||
}
|
|
||||||
return a.satDetailResult("gpu", "gpu-nvidia-", "GPU detail")
|
|
||||||
case "DISK":
|
|
||||||
return a.satDetailResult("storage", "storage-", "DISK detail")
|
|
||||||
case "PSU":
|
|
||||||
return a.psuDetailResult()
|
|
||||||
default:
|
|
||||||
return ActionResult{Title: key, Body: "No detail available."}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *App) cpuDetailResult(satOnly bool) ActionResult {
|
|
||||||
var b strings.Builder
|
|
||||||
|
|
||||||
// Show latest SAT summary if available.
|
|
||||||
satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT")
|
|
||||||
if satResult.Body != "No test results found. Run a test first." {
|
|
||||||
fmt.Fprintln(&b, "=== Last SAT ===")
|
|
||||||
fmt.Fprintln(&b, satResult.Body)
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
}
|
|
||||||
|
|
||||||
if satOnly {
|
|
||||||
body := strings.TrimSpace(b.String())
|
|
||||||
if body == "" {
|
|
||||||
body = "No CPU SAT results found. Run a test first."
|
|
||||||
}
|
|
||||||
return ActionResult{Title: "CPU SAT", Body: body}
|
|
||||||
}
|
|
||||||
|
|
||||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
|
||||||
if err != nil {
|
|
||||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
|
||||||
}
|
|
||||||
var snap schema.HardwareIngestRequest
|
|
||||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
|
||||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
|
||||||
}
|
|
||||||
if len(snap.Hardware.CPUs) == 0 {
|
|
||||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
|
||||||
}
|
|
||||||
fmt.Fprintln(&b, "=== Audit ===")
|
|
||||||
for i, cpu := range snap.Hardware.CPUs {
|
|
||||||
fmt.Fprintf(&b, "CPU %d\n", i)
|
|
||||||
if cpu.Model != nil {
|
|
||||||
fmt.Fprintf(&b, " Model: %s\n", *cpu.Model)
|
|
||||||
}
|
|
||||||
if cpu.Manufacturer != nil {
|
|
||||||
fmt.Fprintf(&b, " Vendor: %s\n", *cpu.Manufacturer)
|
|
||||||
}
|
|
||||||
if cpu.Cores != nil {
|
|
||||||
fmt.Fprintf(&b, " Cores: %d\n", *cpu.Cores)
|
|
||||||
}
|
|
||||||
if cpu.Threads != nil {
|
|
||||||
fmt.Fprintf(&b, " Threads: %d\n", *cpu.Threads)
|
|
||||||
}
|
|
||||||
if cpu.MaxFrequencyMHz != nil {
|
|
||||||
fmt.Fprintf(&b, " Max freq: %d MHz\n", *cpu.MaxFrequencyMHz)
|
|
||||||
}
|
|
||||||
if cpu.TemperatureC != nil {
|
|
||||||
fmt.Fprintf(&b, " Temp: %.1f°C\n", *cpu.TemperatureC)
|
|
||||||
}
|
|
||||||
if cpu.Throttled != nil {
|
|
||||||
fmt.Fprintf(&b, " Throttled: %v\n", *cpu.Throttled)
|
|
||||||
}
|
|
||||||
if cpu.CorrectableErrorCount != nil && *cpu.CorrectableErrorCount > 0 {
|
|
||||||
fmt.Fprintf(&b, " ECC correctable: %d\n", *cpu.CorrectableErrorCount)
|
|
||||||
}
|
|
||||||
if cpu.UncorrectableErrorCount != nil && *cpu.UncorrectableErrorCount > 0 {
|
|
||||||
fmt.Fprintf(&b, " ECC uncorrectable: %d\n", *cpu.UncorrectableErrorCount)
|
|
||||||
}
|
|
||||||
if i < len(snap.Hardware.CPUs)-1 {
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *App) satDetailResult(statusKey, prefix, title string) ActionResult {
|
|
||||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, prefix+"*/summary.txt"))
|
|
||||||
if err != nil || len(matches) == 0 {
|
|
||||||
return ActionResult{Title: title, Body: "No test results found. Run a test first."}
|
|
||||||
}
|
|
||||||
sort.Strings(matches)
|
|
||||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
|
||||||
if err != nil {
|
|
||||||
return ActionResult{Title: title, Body: "Could not read test results."}
|
|
||||||
}
|
|
||||||
return ActionResult{Title: title, Body: formatSATDetail(strings.TrimSpace(string(raw)))}
|
|
||||||
}
|
|
||||||
|
|
||||||
// formatSATDetail converts raw summary.txt key=value content to a human-readable per-step display.
|
|
||||||
func formatSATDetail(raw string) string {
|
|
||||||
var b strings.Builder
|
|
||||||
kv := parseKeyValueSummary(raw)
|
|
||||||
|
|
||||||
if t, ok := kv["run_at_utc"]; ok {
|
|
||||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect step names in order they appear in the file
|
|
||||||
lines := strings.Split(raw, "\n")
|
|
||||||
var stepKeys []string
|
|
||||||
seenStep := map[string]bool{}
|
|
||||||
for _, line := range lines {
|
|
||||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
|
||||||
key := line[:idx]
|
|
||||||
if !seenStep[key] && key != "overall" {
|
|
||||||
seenStep[key] = true
|
|
||||||
stepKeys = append(stepKeys, key)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, key := range stepKeys {
|
|
||||||
status := kv[key+"_status"]
|
|
||||||
display := cleanSummaryKey(key)
|
|
||||||
switch status {
|
|
||||||
case "OK":
|
|
||||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
|
||||||
case "FAILED":
|
|
||||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
|
||||||
case "UNSUPPORTED":
|
|
||||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
|
||||||
default:
|
|
||||||
fmt.Fprintf(&b, "? %s\n", display)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if overall, ok := kv["overall_status"]; ok {
|
|
||||||
ok2 := kv["job_ok"]
|
|
||||||
failed := kv["job_failed"]
|
|
||||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.TrimSpace(b.String())
|
|
||||||
}
|
|
||||||
|
|
||||||
// cleanSummaryKey drops a leading "<digits>-" ordering prefix from a SAT step
// key: "1-lscpu" → "lscpu", "3-stress-ng" → "stress-ng". Keys without a dash,
// with a leading dash, or whose first segment is not purely ASCII digits are
// returned unchanged.
func cleanSummaryKey(key string) string {
	number, rest, found := strings.Cut(key, "-")
	if !found || number == "" {
		return key
	}
	for i := 0; i < len(number); i++ {
		if number[i] < '0' || number[i] > '9' {
			return key
		}
	}
	return rest
}
|
|
||||||
|
|
||||||
func (a *App) psuDetailResult() ActionResult {
|
|
||||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
|
||||||
if err != nil {
|
|
||||||
return ActionResult{Title: "PSU", Body: "No audit data."}
|
|
||||||
}
|
|
||||||
var snap schema.HardwareIngestRequest
|
|
||||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
|
||||||
return ActionResult{Title: "PSU", Body: "Audit data unreadable."}
|
|
||||||
}
|
|
||||||
if len(snap.Hardware.PowerSupplies) == 0 {
|
|
||||||
return ActionResult{Title: "PSU", Body: "No PSU data in last audit."}
|
|
||||||
}
|
|
||||||
var b strings.Builder
|
|
||||||
for i, psu := range snap.Hardware.PowerSupplies {
|
|
||||||
fmt.Fprintf(&b, "PSU %d\n", i)
|
|
||||||
if psu.Model != nil {
|
|
||||||
fmt.Fprintf(&b, " Model: %s\n", *psu.Model)
|
|
||||||
}
|
|
||||||
if psu.Vendor != nil {
|
|
||||||
fmt.Fprintf(&b, " Vendor: %s\n", *psu.Vendor)
|
|
||||||
}
|
|
||||||
if psu.WattageW != nil {
|
|
||||||
fmt.Fprintf(&b, " Rated: %d W\n", *psu.WattageW)
|
|
||||||
}
|
|
||||||
if psu.InputPowerW != nil {
|
|
||||||
fmt.Fprintf(&b, " Input: %.1f W\n", *psu.InputPowerW)
|
|
||||||
}
|
|
||||||
if psu.OutputPowerW != nil {
|
|
||||||
fmt.Fprintf(&b, " Output: %.1f W\n", *psu.OutputPowerW)
|
|
||||||
}
|
|
||||||
if psu.TemperatureC != nil {
|
|
||||||
fmt.Fprintf(&b, " Temp: %.1f°C\n", *psu.TemperatureC)
|
|
||||||
}
|
|
||||||
if i < len(snap.Hardware.PowerSupplies)-1 {
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ActionResult{Title: "PSU", Body: strings.TrimSpace(b.String())}
|
|
||||||
}
|
|
||||||
|
|
||||||
// satStatuses reads the latest summary.txt for each SAT type and returns
|
|
||||||
// a map of component key ("gpu","memory","storage") → status ("PASS","FAIL","CANCEL","N/A").
|
|
||||||
func satStatuses() map[string]string {
|
|
||||||
result := map[string]string{
|
|
||||||
"gpu": "N/A",
|
|
||||||
"memory": "N/A",
|
|
||||||
"storage": "N/A",
|
|
||||||
"cpu": "N/A",
|
|
||||||
}
|
|
||||||
patterns := []struct {
|
|
||||||
key string
|
|
||||||
prefix string
|
|
||||||
}{
|
|
||||||
{"gpu", "gpu-nvidia-"},
|
|
||||||
{"gpu", "gpu-amd-"},
|
|
||||||
{"memory", "memory-"},
|
|
||||||
{"storage", "storage-"},
|
|
||||||
{"cpu", "cpu-"},
|
|
||||||
}
|
|
||||||
for _, item := range patterns {
|
|
||||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
|
||||||
if err != nil || len(matches) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
sort.Strings(matches)
|
|
||||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
values := parseKeyValueSummary(string(raw))
|
|
||||||
switch strings.ToUpper(strings.TrimSpace(values["overall_status"])) {
|
|
||||||
case "OK":
|
|
||||||
result[item.key] = "PASS"
|
|
||||||
case "FAILED":
|
|
||||||
result[item.key] = "FAIL"
|
|
||||||
case "CANCELED", "CANCELLED":
|
|
||||||
result[item.key] = "CANCEL"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func formatPSULine(psus []schema.HardwarePowerSupply) string {
|
|
||||||
var present []schema.HardwarePowerSupply
|
|
||||||
for _, psu := range psus {
|
|
||||||
if psu.Present != nil && !*psu.Present {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
present = append(present, psu)
|
|
||||||
}
|
|
||||||
if len(present) == 0 {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
firstW := 0
|
|
||||||
if present[0].WattageW != nil {
|
|
||||||
firstW = *present[0].WattageW
|
|
||||||
}
|
|
||||||
allSame := firstW > 0
|
|
||||||
for _, p := range present[1:] {
|
|
||||||
w := 0
|
|
||||||
if p.WattageW != nil {
|
|
||||||
w = *p.WattageW
|
|
||||||
}
|
|
||||||
if w != firstW {
|
|
||||||
allSame = false
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if allSame && firstW > 0 {
|
|
||||||
return fmt.Sprintf("%dx %dW", len(present), firstW)
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%d PSU", len(present))
|
|
||||||
}
|
|
||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
|||||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||||
applyStorageSAT(snap.Storage, summary)
|
applyStorageSAT(snap.Storage, summary)
|
||||||
}
|
}
|
||||||
|
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||||
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
@@ -141,9 +143,11 @@ func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
|||||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||||
case "OK":
|
case "OK":
|
||||||
return "OK", label + " passed", true
|
// No error description on success — error_description is for problems only.
|
||||||
|
return "OK", "", true
|
||||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||||
return "Warning", label + " incomplete", true
|
// Tool couldn't run or test was incomplete — we can't assert hardware health.
|
||||||
|
return "Unknown", "", true
|
||||||
case "FAILED":
|
case "FAILED":
|
||||||
return "Critical", label + " failed", true
|
return "Critical", label + " failed", true
|
||||||
default:
|
default:
|
||||||
@@ -180,6 +184,8 @@ func statusSeverity(status string) int {
|
|||||||
return 2
|
return 2
|
||||||
case "OK":
|
case "OK":
|
||||||
return 1
|
return 1
|
||||||
|
case "Unknown":
|
||||||
|
return 1 // same as OK — does not override OK from another source
|
||||||
default:
|
default:
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
@@ -202,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||||
|
if snap == nil || db == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, rec := range db.All() {
|
||||||
|
key := rec.ComponentKey
|
||||||
|
status := dbStatusToSATStatus(rec.Status)
|
||||||
|
if status == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
detail := rec.ErrorSummary
|
||||||
|
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(key, "pcie:"):
|
||||||
|
bdf := strings.TrimPrefix(key, "pcie:")
|
||||||
|
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||||
|
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||||
|
if sanitizeBDFForLookup(bdf) == "" {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
normalized := sanitizeBDFForLookup(bdf)
|
||||||
|
for i := range snap.PCIeDevices {
|
||||||
|
if snap.PCIeDevices[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||||
|
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "storage:"):
|
||||||
|
devName := strings.TrimPrefix(key, "storage:")
|
||||||
|
if devName == "all" {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||||
|
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "memory:"):
|
||||||
|
for i := range snap.Memory {
|
||||||
|
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "cpu:"):
|
||||||
|
for i := range snap.CPUs {
|
||||||
|
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// dbStatusToSATStatus converts a ComponentStatusDB status string to the form
// expected by mergeComponentStatus ("OK", "Warning", "Critical", "Unknown").
//
// Surrounding whitespace is ignored and the canonical, trimmed value is
// returned, so a padded status such as " OK " does not leak through unchanged.
// Any other value yields "", which callers treat as "skip this record".
func dbStatusToSATStatus(s string) string {
	switch status := strings.TrimSpace(s); status {
	case "OK", "Warning", "Critical", "Unknown":
		return status
	default:
		return ""
	}
}
|
||||||
|
|
||||||
|
// sanitizeBDFForLookup lower-cases and trims a PCIe BDF address so two
// spellings of the same address compare equal. An address with exactly one
// colon gets the "0000:" domain prepended ("c8:00.0" → "0000:c8:00.0"); other
// values are returned as-is after lower-casing. The result is "" for empty
// input, the literal "gpu", or anything containing a space or tab.
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
		return ""
	case strings.Count(addr, ":") == 1:
		return "0000:" + addr
	}
	return addr
}
|
||||||
|
|
||||||
func ptrString(v *string) string {
|
func ptrString(v *string) string {
|
||||||
if v == nil {
|
if v == nil {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
|||||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
"bee-nvidia.service",
|
"bee-nvidia.service",
|
||||||
"bee-preflight.service",
|
"bee-preflight.service",
|
||||||
|
"bee-selfheal.service",
|
||||||
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -27,15 +29,118 @@ var supportBundleCommands = []struct {
|
|||||||
cmd []string
|
cmd []string
|
||||||
}{
|
}{
|
||||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||||
|
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||||
|
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||||
|
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||||
|
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
|
dev=$(basename "$d")
|
||||||
|
echo "=== $dev ==="
|
||||||
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
|
done
|
||||||
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -i "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -m "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v mstflint >/dev/null 2>&1; then
|
||||||
|
echo "mstflint not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$path/vendor" ] || continue
|
||||||
|
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||||
|
[ "$vendor" = "0x15b3" ] || continue
|
||||||
|
bdf=$(basename "$path")
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
mstflint -d "$bdf" q 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no Mellanox/NVIDIA networking devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// supportBundleOptionalFiles lists host log files that are added to a support
// bundle when they can be copied. name is the path of the copy relative to the
// bundle staging root; src is the path of the file on the host.
var supportBundleOptionalFiles = []struct {
	name, src string
}{
	{
		name: "system/kern.log",
		src:  "/var/log/kern.log",
	},
	{
		name: "system/syslog.txt",
		src:  "/var/log/syslog",
	},
}
|
||||||
|
|
||||||
|
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
if exportDir == "" {
|
if exportDir == "" {
|
||||||
@@ -75,6 +180,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, item := range supportBundleOptionalFiles {
|
||||||
|
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||||
|
}
|
||||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -86,34 +194,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func LatestSupportBundlePath() (string, error) {
|
||||||
|
return latestSupportBundlePath(os.TempDir())
|
||||||
|
}
|
||||||
|
|
||||||
func cleanupOldSupportBundles(dir string) error {
|
func cleanupOldSupportBundles(dir string) error {
|
||||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
type entry struct {
|
entries := supportBundleEntries(matches)
|
||||||
path string
|
for path, mod := range entries {
|
||||||
mod time.Time
|
if time.Since(mod) > 24*time.Hour {
|
||||||
|
_ = os.Remove(path)
|
||||||
|
delete(entries, path)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
list := make([]entry, 0, len(matches))
|
ordered := orderSupportBundles(entries)
|
||||||
|
if len(ordered) > 3 {
|
||||||
|
for _, old := range ordered[3:] {
|
||||||
|
_ = os.Remove(old)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func latestSupportBundlePath(dir string) (string, error) {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||||
|
if len(ordered) == 0 {
|
||||||
|
return "", os.ErrNotExist
|
||||||
|
}
|
||||||
|
return ordered[0], nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||||
|
entries := make(map[string]time.Time, len(matches))
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
info, err := os.Stat(match)
|
info, err := os.Stat(match)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
entries[match] = info.ModTime()
|
||||||
_ = os.Remove(match)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
|
||||||
}
|
}
|
||||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
return entries
|
||||||
if len(list) > 3 {
|
}
|
||||||
for _, old := range list[3:] {
|
|
||||||
_ = os.Remove(old.path)
|
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||||
}
|
ordered := make([]string, 0, len(entries))
|
||||||
|
for path := range entries {
|
||||||
|
ordered = append(ordered, path)
|
||||||
}
|
}
|
||||||
return nil
|
sort.Slice(ordered, func(i, j int) bool {
|
||||||
|
return entries[ordered[i]].After(entries[ordered[j]])
|
||||||
|
})
|
||||||
|
return ordered
|
||||||
}
|
}
|
||||||
|
|
||||||
func writeJournalDump(dst string) error {
|
func writeJournalDump(dst string) error {
|
||||||
@@ -152,6 +290,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
|||||||
return os.WriteFile(dst, raw, 0644)
|
return os.WriteFile(dst, raw, 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// copyOptionalFile copies the file at src to dst, creating dst's parent
// directory when needed.
//
// src is opened before anything is created, so a missing source returns an
// error without leaving an empty directory or file behind. The error from
// closing dst is returned as well, because a failed flush can make the copy
// incomplete even though every Write call succeeded.
func copyOptionalFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	// The source is only read, so its Close error carries no information.
	defer in.Close()

	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
		return err
	}

	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		// Report the copy failure; the Close error would only mask it.
		out.Close()
		return err
	}
	return out.Close()
}
|
||||||
|
|
||||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -215,7 +371,7 @@ func copyDirContents(srcDir, dstDir string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||||
if cleanRel == "" {
|
if cleanRel == "" {
|
||||||
return true
|
return true
|
||||||
@@ -227,7 +383,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
})
|
}); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeSupportBundleAuditJSON(path string) error {
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
normalized, err := ApplySATOverlay(data)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, normalized, 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||||
|
|||||||
@@ -1,10 +1,18 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "bee/audit/internal/schema"
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||||
|
finalizeSnapshot(snap, collectedAt)
|
||||||
|
}
|
||||||
|
|
||||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||||
snap.Memory = filterMemory(snap.Memory)
|
snap.Memory = filterMemory(snap.Memory)
|
||||||
snap.Storage = filterStorage(snap.Storage)
|
snap.Storage = filterStorage(snap.Storage)
|
||||||
|
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||||
|
|
||||||
setComponentStatusMetadata(snap, collectedAt)
|
setComponentStatusMetadata(snap, collectedAt)
|
||||||
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
|||||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
out = append(out, disk)
|
out = append(out, disk)
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||||
|
for _, dev := range devs {
|
||||||
|
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, dev)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||||
for _, psu := range psus {
|
for _, psu := range psus {
|
||||||
|
|||||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
|||||||
present := true
|
present := true
|
||||||
status := statusOK
|
status := statusOK
|
||||||
serial := "SN-1"
|
serial := "SN-1"
|
||||||
|
virtualModel := "Virtual HDisk1"
|
||||||
|
realModel := "PASCARI"
|
||||||
|
coProcessorClass := "Co-processor"
|
||||||
|
gpuClass := "VideoController"
|
||||||
|
|
||||||
snap := schema.HardwareSnapshot{
|
snap := schema.HardwareSnapshot{
|
||||||
Memory: []schema.HardwareMemory{
|
Memory: []schema.HardwareMemory{
|
||||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
|||||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
},
|
},
|
||||||
Storage: []schema.HardwareStorage{
|
Storage: []schema.HardwareStorage{
|
||||||
|
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
},
|
},
|
||||||
|
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||||
|
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
},
|
||||||
PowerSupplies: []schema.HardwarePowerSupply{
|
PowerSupplies: []schema.HardwarePowerSupply{
|
||||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
|||||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||||
}
|
}
|
||||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||||
}
|
}
|
||||||
|
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||||
|
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||||
|
}
|
||||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,18 +2,21 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"context"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const mellanoxVendorID = 0x15b3
|
const mellanoxVendorID = 0x15b3
|
||||||
|
const nicProbeTimeout = 2 * time.Second
|
||||||
|
|
||||||
var (
|
var (
|
||||||
mstflintQuery = func(bdf string) (string, error) {
|
mstflintQuery = func(bdf string) (string, error) {
|
||||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -21,7 +24,7 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -29,6 +32,14 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
netIfacesByBDF = listNetIfacesByBDF
|
netIfacesByBDF = listNetIfacesByBDF
|
||||||
|
readNetCarrierFile = func(iface string) (string, error) {
|
||||||
|
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(raw)), nil
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||||
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
|
|||||||
}
|
}
|
||||||
return ifaces
|
return ifaces
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// commandOutputWithTimeout runs name with args and returns its standard
// output, killing the process if it has not finished within timeout.
//
// When the command fails because the deadline expired, the context error
// (context.DeadlineExceeded) is returned instead of the generic
// "signal: killed" exit error, so callers can tell a hung tool from one that
// failed on its own. All other errors are returned unchanged.
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, name, args...).Output()
	if err != nil {
		// Only substitute the context error on failure: a command that
		// completed successfully right at the deadline is still a success.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return out, ctxErr
		}
	}
	return out, err
}
|
||||||
|
|
||||||
|
func interfaceHasCarrier(iface string) bool {
|
||||||
|
raw, err := readNetCarrierFile(iface)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(raw) == "1"
|
||||||
|
}
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if interfaceHasCarrier(iface) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
enriched++
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
continue
|
enriched++
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
origReadMAC := readNetAddressFile
|
origReadMAC := readNetAddressFile
|
||||||
origEth := ethtoolInfoQuery
|
origEth := ethtoolInfoQuery
|
||||||
origModule := ethtoolModuleQuery
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
queryPCILSPCIDetail = origDetail
|
queryPCILSPCIDetail = origDetail
|
||||||
readPCIVPDFile = origVPD
|
readPCIVPDFile = origVPD
|
||||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
readNetAddressFile = origReadMAC
|
readNetAddressFile = origReadMAC
|
||||||
ethtoolInfoQuery = origEth
|
ethtoolInfoQuery = origEth
|
||||||
ethtoolModuleQuery = origModule
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
})
|
})
|
||||||
|
|
||||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return "aa:bb:cc:dd:ee:ff", nil
|
return "aa:bb:cc:dd:ee:ff", nil
|
||||||
}
|
}
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||||
|
|
||||||
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||||
|
origIfaces := netIfacesByBDF
|
||||||
|
origReadMAC := readNetAddressFile
|
||||||
|
origEth := ethtoolInfoQuery
|
||||||
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
|
t.Cleanup(func() {
|
||||||
|
netIfacesByBDF = origIfaces
|
||||||
|
readNetAddressFile = origReadMAC
|
||||||
|
ethtoolInfoQuery = origEth
|
||||||
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
|
})
|
||||||
|
|
||||||
|
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||||
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
|
ethtoolModuleQuery = func(string) (string, error) {
|
||||||
|
t.Fatal("ethtool -m should not be called without carrier")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "EthernetController"
|
||||||
|
bdf := "0000:18:00.0"
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
DeviceClass: &class,
|
||||||
|
BDF: &bdf,
|
||||||
|
}}
|
||||||
|
|
||||||
|
out := enrichPCIeWithNICTelemetry(devs)
|
||||||
|
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||||
|
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDBMValue(t *testing.T) {
|
func TestDBMValue(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
in string
|
in string
|
||||||
|
|||||||
@@ -13,14 +13,18 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
BDF string
|
BDF string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
PowerW *float64
|
PowerW *float64
|
||||||
ECCUncorrected *int64
|
ECCUncorrected *int64
|
||||||
ECCCorrected *int64
|
ECCCorrected *int64
|
||||||
HWSlowdown *bool
|
HWSlowdown *bool
|
||||||
|
PCIeLinkGenCurrent *int
|
||||||
|
PCIeLinkGenMax *int
|
||||||
|
PCIeLinkWidthCur *int
|
||||||
|
PCIeLinkWidthMax *int
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 9 {
|
if len(rec) < 13 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Serial: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
VBIOS: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
TemperatureC: parseMaybeFloat(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
PowerW: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
HWSlowdown: parseMaybeBool(rec[8]),
|
||||||
|
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||||
|
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||||
|
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseMaybeInt parses v as a base-10 int after trimming surrounding
// whitespace. It returns nil when v is empty, when it equals one of the
// placeholder strings "n/a", "not supported" or "[not supported]"
// (case-insensitive), or when strconv.Atoi rejects it.
func parseMaybeInt(v string) *int {
	trimmed := strings.TrimSpace(v)
	if trimmed == "" {
		return nil
	}
	for _, placeholder := range []string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(trimmed, placeholder) {
			return nil
		}
	}

	parsed, err := strconv.Atoi(trimmed)
	if err != nil {
		return nil
	}
	return &parsed
}
|
||||||
|
|
||||||
|
// pcieLinkGenLabel formats a PCIe generation number as a label: "Gen"
// followed by the decimal value of gen (for example 4 becomes "Gen4").
func pcieLinkGenLabel(gen int) string {
	return "Gen" + strconv.Itoa(gen)
}
|
||||||
|
|
||||||
func parseMaybeBool(v string) *bool {
|
func parseMaybeBool(v string) *bool {
|
||||||
v = strings.TrimSpace(strings.ToLower(v))
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
switch v {
|
switch v {
|
||||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
|||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.HWSlowdown = info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
}
|
||||||
|
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||||
|
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||||
|
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||||
|
// knows the negotiated speed regardless of the current power state.
|
||||||
|
if info.PCIeLinkGenCurrent != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||||
|
dev.LinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkGenMax != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||||
|
dev.MaxLinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthCur != nil {
|
||||||
|
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthMax != nil {
|
||||||
|
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||||
}
|
}
|
||||||
|
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||||
|
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||||
|
}
|
||||||
|
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||||
|
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizePCIeBDF(t *testing.T) {
|
func TestNormalizePCIeBDF(t *testing.T) {
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
|||||||
"host bridge",
|
"host bridge",
|
||||||
"isa bridge",
|
"isa bridge",
|
||||||
"pci bridge",
|
"pci bridge",
|
||||||
|
"co-processor",
|
||||||
"performance counter",
|
"performance counter",
|
||||||
"performance counters",
|
"performance counters",
|
||||||
"ram memory",
|
"ram memory",
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
|||||||
{name: "audio", class: "Audio device", want: false},
|
{name: "audio", class: "Audio device", want: false},
|
||||||
{name: "host bridge", class: "Host bridge", want: false},
|
{name: "host bridge", class: "Host bridge", want: false},
|
||||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||||
|
{name: "co-processor", class: "Co-processor", want: false},
|
||||||
{name: "smbus", class: "SMBus", want: false},
|
{name: "smbus", class: "SMBus", want: false},
|
||||||
{name: "perf", class: "Performance counters", want: false},
|
{name: "perf", class: "Performance counters", want: false},
|
||||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||||
@@ -76,6 +77,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||||
|
input := "" +
|
||||||
|
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||||
|
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||||
|
|
||||||
|
devs := parseLspci(input)
|
||||||
|
if len(devs) != 1 {
|
||||||
|
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||||
|
}
|
||||||
|
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||||
|
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||||
|
|
||||||
|
|||||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
|||||||
if dev.Type != "disk" {
|
if dev.Type != "disk" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isVirtualBMCDisk(dev) {
|
||||||
|
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||||
|
continue
|
||||||
|
}
|
||||||
disks = append(disks, dev)
|
disks = append(disks, dev)
|
||||||
}
|
}
|
||||||
return disks
|
return disks
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||||
|
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||||
|
// These have zero reported size, a generic fake serial, and a model name that
|
||||||
|
// starts with "Virtual HDisk".
|
||||||
|
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||||
|
return isVirtualHDiskModel(dev.Model)
|
||||||
|
}
|
||||||
|
|
||||||
|
// isVirtualHDiskModel reports whether model begins with "virtual hdisk" once
// surrounding whitespace is removed and letter case is ignored.
func isVirtualHDiskModel(model string) bool {
	const prefix = "virtual hdisk"
	normalized := strings.TrimSpace(model)
	normalized = strings.ToLower(normalized)
	return strings.HasPrefix(normalized, prefix)
}
|
||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||||
|
|||||||
1006
audit/internal/platform/benchmark.go
Normal file
1006
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
141
audit/internal/platform/benchmark_report.go
Normal file
141
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||||
|
fmt.Fprintf(&b, "===========================\n\n")
|
||||||
|
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
||||||
|
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
||||||
|
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
||||||
|
|
||||||
|
if len(result.Findings) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Executive Summary\n")
|
||||||
|
fmt.Fprintf(&b, "-----------------\n")
|
||||||
|
for _, finding := range result.Findings {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", finding)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(result.Warnings) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Warnings\n")
|
||||||
|
fmt.Fprintf(&b, "--------\n")
|
||||||
|
for _, warning := range result.Warnings {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", warning)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
||||||
|
fmt.Fprintf(&b, "-----------------\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
||||||
|
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||||
|
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||||
|
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||||
|
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||||
|
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||||
|
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||||
|
if gpu.Scores.InterconnectScore > 0 {
|
||||||
|
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
||||||
|
}
|
||||||
|
if len(gpu.DegradationReasons) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
||||||
|
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
||||||
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Precision results:\n")
|
||||||
|
for _, precision := range gpu.PrecisionResults {
|
||||||
|
if precision.Supported {
|
||||||
|
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
|
||||||
|
gpu.Throttle.SWPowerCapUS,
|
||||||
|
gpu.Throttle.SWThermalSlowdownUS,
|
||||||
|
gpu.Throttle.SyncBoostUS,
|
||||||
|
gpu.Throttle.HWThermalSlowdownUS,
|
||||||
|
gpu.Throttle.HWPowerBrakeSlowdownUS,
|
||||||
|
)
|
||||||
|
if len(gpu.Notes) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Notes:\n")
|
||||||
|
for _, note := range gpu.Notes {
|
||||||
|
fmt.Fprintf(&b, " - %s\n", note)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "Interconnect\n")
|
||||||
|
fmt.Fprintf(&b, "------------\n")
|
||||||
|
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
||||||
|
if result.Interconnect.Supported {
|
||||||
|
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
||||||
|
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
for _, note := range result.Interconnect.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Methodology\n")
|
||||||
|
fmt.Fprintf(&b, "-----------\n")
|
||||||
|
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
||||||
|
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||||
|
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Raw Files\n")
|
||||||
|
fmt.Fprintf(&b, "---------\n")
|
||||||
|
fmt.Fprintf(&b, "- result.json\n")
|
||||||
|
fmt.Fprintf(&b, "- report.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- summary.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- verbose.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||||
|
var best float64
|
||||||
|
for i, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||||
|
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||||
|
best = gpu.Scores.CompositeScore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||||
|
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
147
audit/internal/platform/benchmark_test.go
Normal file
147
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
profile string
|
||||||
|
want benchmarkProfileSpec
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "default",
|
||||||
|
profile: "",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "stability",
|
||||||
|
profile: "stability",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight",
|
||||||
|
profile: "overnight",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got := resolveBenchmarkProfile(tc.profile)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||||
|
Profile: "stability",
|
||||||
|
RunNCCL: false,
|
||||||
|
})
|
||||||
|
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||||
|
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||||
|
}
|
||||||
|
if opts.RunNCCL {
|
||||||
|
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := strings.Join([]string{
|
||||||
|
"loader=bee-gpu-burn",
|
||||||
|
"[gpu 0] device=NVIDIA H100",
|
||||||
|
"[gpu 0] compute_capability=9.0",
|
||||||
|
"[gpu 0] backend=cublasLt",
|
||||||
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
|
"[gpu 0] status=OK",
|
||||||
|
}, "\n")
|
||||||
|
|
||||||
|
got := parseBenchmarkBurnLog(raw)
|
||||||
|
if got.Backend != "cublasLt" {
|
||||||
|
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||||
|
}
|
||||||
|
if got.ComputeCapability != "9.0" {
|
||||||
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
|
}
|
||||||
|
if len(got.Profiles) != 2 {
|
||||||
|
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||||
|
}
|
||||||
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
|
}
|
||||||
|
if got.Profiles[1].Category != "fp8" {
|
||||||
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
result := NvidiaBenchmarkResult{
|
||||||
|
BenchmarkVersion: benchmarkVersion,
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "PARTIAL",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "partial",
|
||||||
|
},
|
||||||
|
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||||
|
GPUs: []BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100",
|
||||||
|
Status: "OK",
|
||||||
|
Steady: BenchmarkTelemetrySummary{
|
||||||
|
AvgPowerW: 680,
|
||||||
|
AvgTempC: 79,
|
||||||
|
AvgGraphicsClockMHz: 1725,
|
||||||
|
P95PowerW: 700,
|
||||||
|
P95TempC: 82,
|
||||||
|
P95GraphicsClockMHz: 1800,
|
||||||
|
},
|
||||||
|
Scores: BenchmarkScorecard{
|
||||||
|
ComputeScore: 1200,
|
||||||
|
PowerSustainScore: 96,
|
||||||
|
ThermalSustainScore: 88,
|
||||||
|
StabilityScore: 92,
|
||||||
|
CompositeScore: 1176,
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||||
|
},
|
||||||
|
Throttle: BenchmarkThrottleCounters{
|
||||||
|
SWPowerCapUS: 1000000,
|
||||||
|
},
|
||||||
|
DegradationReasons: []string{"power_capped"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(result)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Executive Summary",
|
||||||
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
|
"Composite score: 1176.00",
|
||||||
|
"fp16_tensor: 700.00 TOPS",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
132
audit/internal/platform/benchmark_types.go
Normal file
132
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Names of the NVIDIA benchmark profiles. These are the values stored in
// NvidiaBenchmarkOptions.Profile and NvidiaBenchmarkResult.BenchmarkProfile.
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)
|
||||||
|
|
||||||
|
// NvidiaBenchmarkOptions holds the caller-supplied settings for an NVIDIA
// benchmark run: the profile name, a size in MB, the GPU indices to include
// and to exclude, and whether to run the NCCL stage.
//
// NOTE(review): what SizeMB sizes, and how GPUIndices and ExcludeGPUIndices
// combine, is decided by the benchmark runner and is not visible here —
// confirm before relying on it.
type NvidiaBenchmarkOptions struct {
	Profile           string
	SizeMB            int
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
}
|
||||||
|
|
||||||
|
// NvidiaBenchmarkResult is the top-level result of one NVIDIA benchmark run.
// It carries JSON tags for serialization. Findings, Warnings, Hostname and
// Interconnect are omitted from the JSON when empty; Interconnect is a
// pointer so that a run without an interconnect result leaves it nil.
type NvidiaBenchmarkResult struct {
	BenchmarkVersion   string                       `json:"benchmark_version"`
	GeneratedAt        time.Time                    `json:"generated_at"`
	Hostname           string                       `json:"hostname,omitempty"`
	BenchmarkProfile   string                       `json:"benchmark_profile"`
	OverallStatus      string                       `json:"overall_status"`
	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
	Findings           []string                     `json:"findings,omitempty"`
	Warnings           []string                     `json:"warnings,omitempty"`
	Normalization      BenchmarkNormalization       `json:"normalization"`
	GPUs               []BenchmarkGPUResult         `json:"gpus"`
	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkNormalization records the overall normalization status of a run,
// free-form notes, and optional per-GPU normalization details.
type BenchmarkNormalization struct {
	Status string                      `json:"status"`
	Notes  []string                    `json:"notes,omitempty"`
	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkNormalizationGPU records normalization details for one GPU,
// identified by Index: its persistence mode, the GPU and memory clock lock
// values in MHz with their statuses, and free-form notes. Every field except
// Index is omitted from the JSON when empty.
type BenchmarkNormalizationGPU struct {
	Index                 int      `json:"index"`
	PersistenceMode       string   `json:"persistence_mode,omitempty"`
	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkGPUResult is the per-GPU portion of a benchmark result. It combines
// identifying fields (index, UUID, name, bus ID, VBIOS), power and clock
// limits, telemetry summaries for the baseline, steady and cooldown phases,
// throttle counters, per-precision results, the scorecard, and any
// degradation reasons and notes.
type BenchmarkGPUResult struct {
	Index                  int                        `json:"index"`
	UUID                   string                     `json:"uuid,omitempty"`
	Name                   string                     `json:"name,omitempty"`
	BusID                  string                     `json:"bus_id,omitempty"`
	VBIOS                  string                     `json:"vbios,omitempty"`
	ComputeCapability      string                     `json:"compute_capability,omitempty"`
	Backend                string                     `json:"backend,omitempty"`
	Status                 string                     `json:"status"`
	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
	Scores                 BenchmarkScorecard         `json:"scores"`
	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
	Notes                  []string                   `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkTelemetrySummary summarises GPU telemetry over one benchmark phase:
// duration and sample count, average and 95th-percentile temperature (C),
// power (W), graphics clock and memory clock (MHz), average GPU and memory
// utilisation (percent), and coefficient-of-variation and drift percentages.
//
// NOTE(review): the exact formulas behind the *CVPct and ClockDriftPct fields
// are defined where the summary is computed, not here — confirm there.
type BenchmarkTelemetrySummary struct {
	DurationSec         float64 `json:"duration_sec"`
	Samples             int     `json:"samples"`
	AvgTempC            float64 `json:"avg_temp_c"`
	P95TempC            float64 `json:"p95_temp_c"`
	AvgPowerW           float64 `json:"avg_power_w"`
	P95PowerW           float64 `json:"p95_power_w"`
	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
	AvgUsagePct         float64 `json:"avg_usage_pct"`
	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
	ClockCVPct          float64 `json:"clock_cv_pct"`
	PowerCVPct          float64 `json:"power_cv_pct"`
	TempCVPct           float64 `json:"temp_cv_pct"`
	ClockDriftPct       float64 `json:"clock_drift_pct"`
}
|
||||||
|
|
||||||
|
// BenchmarkThrottleCounters holds per-reason GPU throttle counters in
// microseconds, as the "US" suffix and "_us" JSON keys indicate: software
// power cap, software thermal slowdown, sync boost, hardware thermal slowdown
// and hardware power-brake slowdown.
type BenchmarkThrottleCounters struct {
	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
	SyncBoostUS            uint64 `json:"sync_boost_us"`
	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}
|
||||||
|
|
||||||
|
// BenchmarkPrecisionResult is the outcome for one numeric precision (for
// example "fp16_tensor") on one GPU: whether it is supported, the measured
// throughput in tera-operations per second, lane and iteration counts, and
// free-form notes.
//
// NOTE(review): M, N and K presumably hold the matrix dimensions of the
// workload — confirm against the burn-log parser.
type BenchmarkPrecisionResult struct {
	Name          string  `json:"name"`
	Category      string  `json:"category"`
	Supported     bool    `json:"supported"`
	Lanes         int     `json:"lanes,omitempty"`
	M             uint64  `json:"m,omitempty"`
	N             uint64  `json:"n,omitempty"`
	K             uint64  `json:"k,omitempty"`
	Iterations    uint64  `json:"iterations,omitempty"`
	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
	Notes         string  `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkScorecard holds the scores computed for one GPU: compute, power
// sustain, thermal sustain, stability, interconnect and the composite score.
// No field uses omitempty, so zero scores are still written to the JSON.
type BenchmarkScorecard struct {
	ComputeScore        float64 `json:"compute_score"`
	PowerSustainScore   float64 `json:"power_sustain_score"`
	ThermalSustainScore float64 `json:"thermal_sustain_score"`
	StabilityScore      float64 `json:"stability_score"`
	InterconnectScore   float64 `json:"interconnect_score"`
	CompositeScore      float64 `json:"composite_score"`
}
|
||||||
|
|
||||||
|
// BenchmarkInterconnectResult is the outcome of the interconnect stage of a
// benchmark run: its status, whether it was attempted and supported, the GPUs
// involved, average and maximum algorithm and bus bandwidth in GB/s, and
// free-form notes.
type BenchmarkInterconnectResult struct {
	Status             string   `json:"status"`
	Attempted          bool     `json:"attempted"`
	Supported          bool     `json:"supported"`
	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
	Notes              []string `json:"notes,omitempty"`
}
|
||||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
	// Name is a short machine-readable label for logging and deduplication.
	Name string
	// Re is the compiled regular expression matched against a single kmsg line.
	Re *regexp.Regexp
	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
	Category string
	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
	Severity string
	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
	BDFGroup int
	// DevGroup is the capture group index (1-based) that contains a device name
	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
	DevGroup int
}

// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
//
// Every expression is compiled with (?i), so matching is case-insensitive.
// Patterns for "corrected"/"correctable" events are anchored with \b so that
// they do not also fire on "uncorrected"/"uncorrectable" lines, which have
// their own patterns.
var HardwareErrorPatterns = []ErrorPattern{
	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
	{
		Name:     "nvidia-rminitadapter",
		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "nvidia-msi-fail",
		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
		Category: "gpu",
		Severity: "warning",
	},
	{
		Name:     "nvidia-aer",
		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		// NOTE(review): this requires a full BDF including the ".<function>"
		// suffix. Xid lines are often logged as "Xid (PCI:0000:c8:00): ..."
		// without it — confirm this matches the driver versions in use.
		Name:     "nvidia-xid",
		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
	{
		Name:     "pcie-aer",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-uncorrectable",
		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*uncorrectable`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-link-down",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*link.*down`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── Storage ─────────────────────────────────────────────────────────────────
	{
		Name:     "blk-io-error",
		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "nvme-timeout",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "scsi-failed",
		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
		Category: "storage",
		Severity: "warning",
	},
	{
		Name:     "nvme-reset",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},

	// ── Machine Check Exceptions ────────────────────────────────────────────────
	{
		Name:     "mce-hardware-error",
		Re:       mustPat(`(?i)mce:.*hardware error`),
		Category: "mce",
		Severity: "warning",
	},
	{
		// \b keeps "Uncorrected ..." lines from being reported as corrected.
		Name:     "mce-corrected",
		Re:       mustPat(`(?i)mce:.*\bcorrected`),
		Category: "mce",
		Severity: "warning",
	},

	// ── Memory ─────────────────────────────────────────────────────────────────
	{
		Name:     "edac-ue",
		Re:       mustPat(`(?i)EDAC.*uncorrectable`),
		Category: "memory",
		Severity: "warning",
	},
	{
		// \b keeps "Uncorrectable" lines (handled by edac-ue) from also
		// matching here.
		Name:     "edac-ce",
		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
		Category: "memory",
		Severity: "warning",
	},
}

// mustPat compiles s with regexp.MustCompile, which panics on an invalid
// expression. It is used while initialising HardwareErrorPatterns, so a
// malformed pattern fails at program start rather than at match time.
func mustPat(s string) *regexp.Regexp {
	return regexp.MustCompile(s)
}
|
||||||
@@ -11,8 +11,48 @@ import (
|
|||||||
|
|
||||||
var exportExecCommand = exec.Command
|
var exportExecCommand = exec.Command
|
||||||
|
|
||||||
|
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// removableTargetReadOnly reports whether the lsblk key/value fields describe
// a target that cannot be written to: either the "RO" field is exactly "1",
// or the "FSTYPE" field (trimmed, case-insensitive) is iso9660 or squashfs.
func removableTargetReadOnly(fields map[string]string) bool {
	if fields["RO"] == "1" {
		return true
	}
	fstype := strings.ToLower(strings.TrimSpace(fields["FSTYPE"]))
	return fstype == "iso9660" || fstype == "squashfs"
}
|
||||||
|
|
||||||
|
// ensureWritableMountpoint checks that files can be created under mountpoint
// by creating a temporary ".bee-write-test-*" file there, closing it and
// deleting it.
//
// A failure to create the file is reported as "target filesystem is not
// writable". A close error is returned as-is (the probe is still removed on a
// best-effort basis); otherwise any error from removing the probe is returned.
func ensureWritableMountpoint(mountpoint string) error {
	f, createErr := os.CreateTemp(mountpoint, ".bee-write-test-*")
	if createErr != nil {
		return fmt.Errorf("target filesystem is not writable: %w", createErr)
	}

	path := f.Name()
	closeErr := f.Close()
	// The probe is removed whether or not Close succeeded; the close error
	// takes precedence when both fail.
	removeErr := os.Remove(path)
	if closeErr != nil {
		return closeErr
	}
	return removeErr
}
|
||||||
|
|
||||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -36,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !removable || fields["FSTYPE"] == "" {
|
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -72,7 +112,7 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
|
|||||||
}
|
}
|
||||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
_ = os.Remove(mountpoint)
|
_ = os.Remove(mountpoint)
|
||||||
return string(raw), err
|
return "", formatMountTargetError(target, string(raw), err)
|
||||||
}
|
}
|
||||||
mountedHere = true
|
mountedHere = true
|
||||||
mounted = true
|
mounted = true
|
||||||
@@ -95,6 +135,10 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst str
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
filename := filepath.Base(src)
|
filename := filepath.Base(src)
|
||||||
dst = filepath.Join(mountpoint, filename)
|
dst = filepath.Join(mountpoint, filename)
|
||||||
data, err := os.ReadFile(src)
|
data, err := os.ReadFile(src)
|
||||||
|
|||||||
@@ -4,12 +4,11 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||||
mountpoint := filepath.Join(tmp, "mnt")
|
mountpoint := filepath.Join(tmp, "mnt")
|
||||||
@@ -54,3 +53,60 @@ func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
|||||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||||
|
mountpoint := filepath.Join(tmp, "mnt")
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
t.Fatalf("mkdir mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||||
|
t.Fatalf("write src: %v", err)
|
||||||
|
}
|
||||||
|
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||||
|
t.Fatalf("chmod mountpoint: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
return exec.Command("sh", "-c", "exit 0")
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
Mountpoint: mountpoint,
|
||||||
|
})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error for non-writable mountpoint")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||||
|
t.Fatalf("err=%q want writable message", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||||
|
oldExec := exportExecCommand
|
||||||
|
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||||
|
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||||
|
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||||
|
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||||
|
|
||||||
|
s := &System{}
|
||||||
|
targets, err := s.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||||
|
}
|
||||||
|
if len(targets) != 1 {
|
||||||
|
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||||
|
}
|
||||||
|
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||||
|
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -13,18 +13,20 @@ import (
|
|||||||
|
|
||||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||||
type GPUMetricRow struct {
|
type GPUMetricRow struct {
|
||||||
ElapsedSec float64
|
ElapsedSec float64 `json:"elapsed_sec"`
|
||||||
GPUIndex int
|
GPUIndex int `json:"index"`
|
||||||
TempC float64
|
TempC float64 `json:"temp_c"`
|
||||||
UsagePct float64
|
UsagePct float64 `json:"usage_pct"`
|
||||||
PowerW float64
|
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||||
ClockMHz float64
|
PowerW float64 `json:"power_w"`
|
||||||
|
ClockMHz float64 `json:"clock_mhz"`
|
||||||
|
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
|
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -45,16 +47,18 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
parts := strings.Split(line, ", ")
|
parts := strings.Split(line, ", ")
|
||||||
if len(parts) < 5 {
|
if len(parts) < 7 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
rows = append(rows, GPUMetricRow{
|
rows = append(rows, GPUMetricRow{
|
||||||
GPUIndex: idx,
|
GPUIndex: idx,
|
||||||
TempC: parseGPUFloat(parts[1]),
|
TempC: parseGPUFloat(parts[1]),
|
||||||
UsagePct: parseGPUFloat(parts[2]),
|
UsagePct: parseGPUFloat(parts[2]),
|
||||||
PowerW: parseGPUFloat(parts[3]),
|
MemUsagePct: parseGPUFloat(parts[3]),
|
||||||
ClockMHz: parseGPUFloat(parts[4]),
|
PowerW: parseGPUFloat(parts[4]),
|
||||||
|
ClockMHz: parseGPUFloat(parts[5]),
|
||||||
|
MemClockMHz: parseGPUFloat(parts[6]),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return rows, nil
|
return rows, nil
|
||||||
@@ -69,13 +73,78 @@ func parseGPUFloat(s string) float64 {
|
|||||||
return v
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
|
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
|
return sampleGPUMetrics(gpuIndices)
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||||
|
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||||
|
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
|
if len(lines) < 2 {
|
||||||
|
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse header to find column indices by name.
|
||||||
|
headers := strings.Split(lines[0], ",")
|
||||||
|
colIdx := func(keywords ...string) int {
|
||||||
|
for i, h := range headers {
|
||||||
|
hl := strings.ToLower(strings.TrimSpace(h))
|
||||||
|
for _, kw := range keywords {
|
||||||
|
if strings.Contains(hl, kw) {
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||||
|
idxUse := colIdx("gpu use (%)")
|
||||||
|
idxMem := colIdx("vram%", "memory allocated")
|
||||||
|
idxPow := colIdx("average graphics package power", "power (w)")
|
||||||
|
|
||||||
|
var rows []GPUMetricRow
|
||||||
|
for _, line := range lines[1:] {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
idx := len(rows)
|
||||||
|
row := GPUMetricRow{GPUIndex: idx}
|
||||||
|
get := func(i int) float64 {
|
||||||
|
if i < 0 || i >= len(parts) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
v := strings.TrimSpace(parts[i])
|
||||||
|
if strings.EqualFold(v, "n/a") {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseGPUFloat(v)
|
||||||
|
}
|
||||||
|
row.TempC = get(idxTemp)
|
||||||
|
row.UsagePct = get(idxUse)
|
||||||
|
row.MemUsagePct = get(idxMem)
|
||||||
|
row.PowerW = get(idxPow)
|
||||||
|
rows = append(rows, row)
|
||||||
|
}
|
||||||
|
if len(rows) == 0 {
|
||||||
|
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||||
|
}
|
||||||
|
return rows, nil
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
|
||||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
@@ -130,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
const PW = plotX2 - plotX1
|
const PW = plotX2 - plotX1
|
||||||
const PH = plotY2 - plotY1
|
const PH = plotY2 - plotY1
|
||||||
// Outer axes
|
// Outer axes
|
||||||
const tempAxisX = 60 // temp axis line
|
const tempAxisX = 60 // temp axis line
|
||||||
const clockAxisX = 900 // clock axis line
|
const clockAxisX = 900 // clock axis line
|
||||||
|
|
||||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||||
@@ -327,7 +396,7 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||||
// Suitable for display in the TUI screenOutput.
|
// Used in SAT stress-test logs.
|
||||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||||
seen := make(map[int]bool)
|
seen := make(map[int]bool)
|
||||||
var order []int
|
var order []int
|
||||||
|
|||||||
269
audit/internal/platform/install.go
Normal file
269
audit/internal/platform/install.go
Normal file
@@ -0,0 +1,269 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// InstallDisk is one whole-disk candidate offered as an installation target.
type InstallDisk struct {
	// Device is the block device path, e.g. /dev/sda.
	Device string
	// Model is the model string reported by lsblk; it may be empty.
	Model string
	// Size is the human-readable size, e.g. "500G".
	Size string
	// SizeBytes is the raw byte count from lsblk.
	SizeBytes int64
	// MountedParts lists the partition mount points that are currently active.
	MountedParts []string
}
|
||||||
|
|
||||||
|
// squashfsPath is the location of the root squashfs image on the mounted
// live medium.
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||||
|
|
||||||
|
// ListInstallDisks returns block devices suitable for installation.
|
||||||
|
// Excludes the current live boot medium but includes USB drives.
|
||||||
|
func (s *System) ListInstallDisks() ([]InstallDisk, error) {
|
||||||
|
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,MODEL,SIZE,TYPE,TRAN").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("lsblk: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
bootDev := findLiveBootDevice()
|
||||||
|
|
||||||
|
var disks []InstallDisk
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
// NAME MODEL SIZE TYPE TRAN — model may have spaces so we parse from end
|
||||||
|
if len(fields) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Last field: TRAN, second-to-last: TYPE, third-to-last: SIZE
|
||||||
|
typ := fields[len(fields)-2]
|
||||||
|
size := fields[len(fields)-3]
|
||||||
|
name := fields[0]
|
||||||
|
model := strings.Join(fields[1:len(fields)-3], " ")
|
||||||
|
|
||||||
|
if typ != "disk" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
device := "/dev/" + name
|
||||||
|
if device == bootDev {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
sizeBytes := diskSizeBytes(device)
|
||||||
|
mounted := mountedParts(device)
|
||||||
|
|
||||||
|
disks = append(disks, InstallDisk{
|
||||||
|
Device: device,
|
||||||
|
Model: strings.TrimSpace(model),
|
||||||
|
Size: size,
|
||||||
|
SizeBytes: sizeBytes,
|
||||||
|
MountedParts: mounted,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return disks, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// diskSizeBytes asks lsblk for the size of device in bytes. It returns 0 when
// lsblk fails; the integer parse error is discarded.
func diskSizeBytes(device string) int64 {
	raw, err := exec.Command("lsblk", "-bdn", "-o", "SIZE", device).Output()
	if err != nil {
		return 0
	}
	text := strings.TrimSpace(string(raw))
	size, _ := strconv.ParseInt(text, 10, 64)
	return size
}
|
||||||
|
|
||||||
|
// mountedParts returns a list of "<part> at <mountpoint>" strings for any
// mounted partitions on the given device.
//
// Entries without a mount point and swap entries ("[SWAP]") are omitted. A
// nil slice is returned when lsblk fails or nothing is mounted.
func mountedParts(device string) []string {
	// -l prints a flat list instead of a tree, so names carry no "├─"/"└─"/"│"
	// drawing characters that would otherwise be split into extra fields for
	// nested children.
	out, err := exec.Command("lsblk", "-ln", "-o", "NAME,MOUNTPOINT", device).Output()
	if err != nil {
		return nil
	}
	var result []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue // no mount point on this line
		}
		// Re-join the remainder so a mount point containing spaces is kept
		// whole instead of being cut at the first space.
		mp := strings.Join(fields[1:], " ")
		if mp == "[SWAP]" {
			continue
		}
		result = append(result, "/dev/"+fields[0]+" at "+mp)
	}
	return result
}
|
||||||
|
|
||||||
|
// findLiveBootDevice returns the block device backing /run/live/medium (if any).
//
// The mount source reported by findmnt is mapped to its parent disk through
// lsblk's PKNAME column (e.g. /dev/sdb1 → /dev/sdb, /dev/nvme0n1p1 →
// /dev/nvme0n1). When the source has no parent, or lsblk fails, the source
// itself is returned. An empty string means findmnt failed or printed nothing.
func findLiveBootDevice() string {
	out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
	if err != nil {
		return ""
	}
	src := strings.TrimSpace(string(out))
	if src == "" {
		return ""
	}
	// -d restricts the output to src itself. Without it lsblk also prints one
	// line per child partition, and a whole-disk source with several
	// partitions would produce a multi-line "device name".
	out2, err := exec.Command("lsblk", "-dno", "PKNAME", src).Output()
	if err != nil {
		return src
	}
	parent := strings.TrimSpace(string(out2))
	if parent == "" {
		return src
	}
	return "/dev/" + parent
}
|
||||||
|
|
||||||
|
// mountSource returns the trimmed SOURCE column that findmnt reports for
// target, or "" when findmnt fails.
func mountSource(target string) string {
	cmd := exec.Command("findmnt", "-n", "-o", "SOURCE", target)
	if raw, err := cmd.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// mountFSType returns the trimmed FSTYPE column that findmnt reports for
// target, or "" when findmnt fails.
func mountFSType(target string) string {
	cmd := exec.Command("findmnt", "-n", "-o", "FSTYPE", target)
	if raw, err := cmd.Output(); err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// blockDeviceType returns the trimmed lsblk TYPE column for device. A blank
// device name, or a failing lsblk, yields "".
func blockDeviceType(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	raw, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// blockDeviceTransport returns the trimmed lsblk TRAN column for device. A
// blank device name, or a failing lsblk, yields "".
func blockDeviceTransport(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	raw, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// inferLiveBootKind classifies the live medium as "ram", "cdrom", "usb",
// "disk" or "unknown". All inputs are whitespace-trimmed first. The checks run
// in this order, and the first hit wins: tmpfs filesystem, "rom" device type,
// "usb" transport, a /dev/sr* source, any other /dev/ source.
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
	fs := strings.TrimSpace(fsType)
	src := strings.TrimSpace(source)
	devType := strings.TrimSpace(deviceType)
	tran := strings.TrimSpace(transport)

	if strings.EqualFold(fs, "tmpfs") {
		return "ram"
	}
	if strings.EqualFold(devType, "rom") {
		return "cdrom"
	}
	if strings.EqualFold(tran, "usb") {
		return "usb"
	}
	if strings.HasPrefix(src, "/dev/sr") {
		return "cdrom"
	}
	if strings.HasPrefix(src, "/dev/") {
		return "disk"
	}
	return "unknown"
}
|
||||||
|
|
||||||
|
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||||
|
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||||
|
// Returns 0 if the squashfs is not available (non-live environment).
|
||||||
|
func MinInstallBytes() int64 {
|
||||||
|
fi, err := os.Stat(squashfsPath)
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return fi.Size() * 3 / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
// toramActive returns true when the live system was booted with toram.
//
// /proc/cmdline is split on whitespace and only a parameter that is exactly
// "toram" or starts with "toram=" counts. A parameter that merely contains
// the letters (for example inside a path or label) does not trigger a false
// positive. It returns false when /proc/cmdline cannot be read.
func toramActive() bool {
	data, err := os.ReadFile("/proc/cmdline")
	if err != nil {
		return false
	}
	for _, param := range strings.Fields(string(data)) {
		if param == "toram" || strings.HasPrefix(param, "toram=") {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// freeMemBytes reads the MemAvailable line of /proc/meminfo and converts its
// kB value to bytes. It returns 0 when the file cannot be read or no usable
// MemAvailable line is present.
func freeMemBytes() int64 {
	raw, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return 0
	}
	const key = "MemAvailable:"
	for _, entry := range strings.Split(string(raw), "\n") {
		if !strings.HasPrefix(entry, key) {
			continue
		}
		cols := strings.Fields(entry)
		if len(cols) < 2 {
			continue
		}
		kib, _ := strconv.ParseInt(cols[1], 10, 64)
		return kib * 1024 // kB → bytes
	}
	return 0
}
|
||||||
|
|
||||||
|
// DiskWarnings returns advisory warning strings for a disk candidate.
|
||||||
|
func DiskWarnings(d InstallDisk) []string {
|
||||||
|
var w []string
|
||||||
|
if len(d.MountedParts) > 0 {
|
||||||
|
w = append(w, "has mounted partitions: "+strings.Join(d.MountedParts, ", "))
|
||||||
|
}
|
||||||
|
min := MinInstallBytes()
|
||||||
|
if min > 0 && d.SizeBytes > 0 && d.SizeBytes < min {
|
||||||
|
w = append(w, fmt.Sprintf("disk may be too small (need ≥ %s, have %s)",
|
||||||
|
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||||
|
}
|
||||||
|
if toramActive() {
|
||||||
|
sqFi, err := os.Stat(squashfsPath)
|
||||||
|
if err == nil {
|
||||||
|
free := freeMemBytes()
|
||||||
|
if free > 0 && free < sqFi.Size()*2 {
|
||||||
|
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
// humanBytes formats a byte count with binary (1024-based) scaling. Values
// below 1024 print as "<n> B"; larger ones print with one decimal and a
// K/M/G/T/P/E prefix, e.g. "1.5 KB".
func humanBytes(b int64) string {
	const unit = 1024
	const prefixes = "KMGTPE"

	if b < unit {
		return fmt.Sprintf("%d B", b)
	}
	// Grow the divisor until the quotient drops below one unit.
	divisor := int64(unit)
	idx := 0
	for b/divisor >= unit {
		divisor *= unit
		idx++
	}
	scaled := float64(b) / float64(divisor)
	return fmt.Sprintf("%.1f %cB", scaled, prefixes[idx])
}
|
||||||
|
|
||||||
|
// InstallToDisk runs bee-install <device> <logfile> and streams output to logFile.
|
||||||
|
// The context can be used to cancel.
|
||||||
|
func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||||
|
cmd := exec.CommandContext(ctx, "bee-install", device, logFile)
|
||||||
|
return cmd.Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// InstallLogPath builds the default install log path for device: every "/"
// and space in the device name becomes "_", and the result is placed between
// "/tmp/bee-install" and ".log".
func InstallLogPath(device string) string {
	safe := strings.ReplaceAll(device, "/", "_")
	safe = strings.ReplaceAll(safe, " ", "_")
	return "/tmp/bee-install" + safe + ".log"
}
|
||||||
|
|
||||||
|
// Label returns a display label for a disk.
|
||||||
|
func (d InstallDisk) Label() string {
|
||||||
|
model := d.Model
|
||||||
|
if model == "" {
|
||||||
|
model = "Unknown"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s %s %s", d.Device, d.Size, model)
|
||||||
|
}
|
||||||
255
audit/internal/platform/install_to_ram.go
Normal file
255
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,255 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
|
fsType := mountFSType("/run/live/medium")
|
||||||
|
if fsType == "" {
|
||||||
|
return toramActive()
|
||||||
|
}
|
||||||
|
return strings.EqualFold(fsType, "tmpfs")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) LiveBootSource() LiveBootSource {
|
||||||
|
fsType := mountFSType("/run/live/medium")
|
||||||
|
source := mountSource("/run/live/medium")
|
||||||
|
device := findLiveBootDevice()
|
||||||
|
status := LiveBootSource{
|
||||||
|
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||||
|
Source: source,
|
||||||
|
Device: device,
|
||||||
|
}
|
||||||
|
if fsType == "" && source == "" && device == "" {
|
||||||
|
if toramActive() {
|
||||||
|
status.InRAM = true
|
||||||
|
status.Kind = "ram"
|
||||||
|
status.Source = "tmpfs"
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
status.Kind = "unknown"
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||||
|
if status.Kind == "" {
|
||||||
|
status.Kind = "unknown"
|
||||||
|
}
|
||||||
|
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||||
|
status.Source = "tmpfs"
|
||||||
|
}
|
||||||
|
return status
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
|
log := func(msg string) {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.IsLiveMediaInRAM() {
|
||||||
|
log("Already running from RAM — installation media can be safely disconnected.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
|
if err != nil || len(squashfsFiles) == 0 {
|
||||||
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||||
|
}
|
||||||
|
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
|
||||||
|
dstDir := "/dev/shm/bee-live"
|
||||||
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
base := filepath.Base(sf)
|
||||||
|
dst := filepath.Join(dstDir, base)
|
||||||
|
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||||
|
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||||
|
return fmt.Errorf("copy %s: %v", base, err)
|
||||||
|
}
|
||||||
|
log(fmt.Sprintf("Copied %s.", base))
|
||||||
|
|
||||||
|
loopDev, err := findLoopForFile(sf)
|
||||||
|
if err != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log("Copying remaining medium files...")
|
||||||
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
|
}
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
log("Verifying live medium now served from RAM...")
|
||||||
|
status := s.LiveBootSource()
|
||||||
|
if err := verifyInstallToRAMStatus(status); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
|
log("Done. Installation media can be safely disconnected.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
||||||
|
if status.InRAM {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
||||||
|
}
|
||||||
|
|
||||||
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
|
source := strings.TrimSpace(status.Device)
|
||||||
|
if source == "" {
|
||||||
|
source = strings.TrimSpace(status.Source)
|
||||||
|
}
|
||||||
|
if source == "" {
|
||||||
|
source = "unknown source"
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(status.Kind) {
|
||||||
|
case "ram":
|
||||||
|
return "RAM"
|
||||||
|
case "usb":
|
||||||
|
return "USB (" + source + ")"
|
||||||
|
case "cdrom":
|
||||||
|
return "CD-ROM (" + source + ")"
|
||||||
|
case "disk":
|
||||||
|
return "disk (" + source + ")"
|
||||||
|
default:
|
||||||
|
return source
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
|
in, err := os.Open(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer in.Close()
|
||||||
|
fi, err := in.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
out, err := os.Create(dst)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer out.Close()
|
||||||
|
total := fi.Size()
|
||||||
|
var copied int64
|
||||||
|
buf := make([]byte, 4*1024*1024)
|
||||||
|
for {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
n, err := in.Read(buf)
|
||||||
|
if n > 0 {
|
||||||
|
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||||
|
return werr
|
||||||
|
}
|
||||||
|
copied += int64(n)
|
||||||
|
if logFunc != nil && total > 0 {
|
||||||
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err == io.EOF {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out.Sync()
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rel, _ := filepath.Rel(src, path)
|
||||||
|
target := filepath.Join(dst, rel)
|
||||||
|
if fi.IsDir() {
|
||||||
|
return os.MkdirAll(target, fi.Mode())
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(path, ".squashfs") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(target); err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return copyFileLarge(ctx, path, target, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// findLoopForFile returns the name of the loop device whose "back-file", as
// reported by "losetup --list --json", equals backingFile. It returns an error
// when losetup fails, its output is not valid JSON, or no entry matches.
func findLoopForFile(backingFile string) (string, error) {
	type loopEntry struct {
		Name     string `json:"name"`
		BackFile string `json:"back-file"`
	}
	type loopListing struct {
		Loopdevices []loopEntry `json:"loopdevices"`
	}

	raw, err := exec.Command("losetup", "--list", "--json").Output()
	if err != nil {
		return "", err
	}
	var listing loopListing
	if err := json.Unmarshal(raw, &listing); err != nil {
		return "", err
	}
	for i := range listing.Loopdevices {
		if entry := listing.Loopdevices[i]; entry.BackFile == backingFile {
			return entry.Name, nil
		}
	}
	return "", fmt.Errorf("no loop device found for %s", backingFile)
}
|
||||||
|
|
||||||
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return loopChangeFD(loopDev, newFile)
|
||||||
|
}
|
||||||
28
audit/internal/platform/install_to_ram_linux.go
Normal file
28
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
//go:build linux
|
||||||
|
|
||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ioctlLoopChangeFD is the LOOP_CHANGE_FD request number from <linux/loop.h>.
// (0x4C08, used previously, is LOOP_SET_DIRECT_IO and does not change the
// backing file.)
const ioctlLoopChangeFD = 0x4C06

// loopChangeFD switches the backing file of loopDev to newFile with the
// LOOP_CHANGE_FD ioctl. It opens the loop device read-write and the new file
// read-only, issues the ioctl, and returns the errno on failure.
//
// Per loop(4), the kernel only accepts this request when the loop device is
// read-only and the new backing store has the same size and type as the old
// one.
func loopChangeFD(loopDev, newFile string) error {
	lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
	if err != nil {
		return err
	}
	defer lf.Close()

	nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
	if err != nil {
		return err
	}
	defer nf.Close()

	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
	if errno != 0 {
		return errno
	}
	return nil
}
|
||||||
9
audit/internal/platform/install_to_ram_other.go
Normal file
9
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
//go:build !linux
|
||||||
|
|
||||||
|
package platform
|
||||||
|
|
||||||
|
import "errors"
|
||||||
|
|
||||||
|
// loopChangeFD is the stub for non-Linux builds; it always reports that the
// operation is unavailable.
func loopChangeFD(loopDev, newFile string) error {
	unsupported := errors.New("LOOP_CHANGE_FD not available on this platform")
	return unsupported
}
|
||||||
57
audit/internal/platform/install_to_ram_test.go
Normal file
57
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestInferLiveBootKind(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
fsType string
|
||||||
|
source string
|
||||||
|
deviceType string
|
||||||
|
transport string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||||
|
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||||
|
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||||
|
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||||
|
{name: "unknown", source: "overlay", want: "unknown"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
||||||
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
|
}
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
|
}
|
||||||
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
||||||
|
t.Fatalf("error=%q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDescribeLiveBootSource(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||||
|
t.Fatalf("got %q want RAM", got)
|
||||||
|
}
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||||
|
t.Fatalf("got %q want /run/live/medium", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
64
audit/internal/platform/kill_workers.go
Normal file
64
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// workerPatterns lists the substrings that mark a process as a bee test
// worker. KillTestWorkers compares them with the executable part of
// /proc/<pid>/cmdline.
var workerPatterns = []string{
	"bee-gpu-burn",
	"stress-ng",
	"stressapptest",
	"memtester",
}
|
||||||
|
|
||||||
|
// KilledProcess identifies one process that was sent SIGKILL.
type KilledProcess struct {
	// PID is the process ID taken from the /proc directory name.
	PID int `json:"pid"`
	// Name is the base name of the process executable.
	Name string `json:"name"`
}
|
||||||
|
|
||||||
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||||
|
// SIGKILL to each one found. It returns a list of killed processes.
|
||||||
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||||
|
func KillTestWorkers() []KilledProcess {
|
||||||
|
entries, err := os.ReadDir("/proc")
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var killed []KilledProcess
|
||||||
|
for _, e := range entries {
|
||||||
|
if !e.IsDir() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pid, err := strconv.Atoi(e.Name())
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||||
|
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||||
|
exe := strings.TrimSpace(args[0])
|
||||||
|
base := exe
|
||||||
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||||
|
base = exe[idx+1:]
|
||||||
|
}
|
||||||
|
for _, pat := range workerPatterns {
|
||||||
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||||
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||||
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return killed
|
||||||
|
}
|
||||||
328
audit/internal/platform/live_metrics.go
Normal file
328
audit/internal/platform/live_metrics.go
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
|
// collected for the web UI metrics page.
|
||||||
|
type LiveMetricSample struct {
|
||||||
|
Timestamp time.Time `json:"ts"`
|
||||||
|
Fans []FanReading `json:"fans"`
|
||||||
|
Temps []TempReading `json:"temps"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TempReading is a single named temperature sensor value.
type TempReading struct {
	// Name is the sensor label.
	Name string `json:"name"`
	// Group is the sensor category; it is omitted from JSON when empty.
	Group string `json:"group,omitempty"`
	// Celsius is the temperature in degrees Celsius.
	Celsius float64 `json:"celsius"`
}
|
||||||
|
|
||||||
|
// SampleLiveMetrics collects a single metrics snapshot from all available
|
||||||
|
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
|
||||||
|
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
|
||||||
|
func SampleLiveMetrics() LiveMetricSample {
|
||||||
|
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
||||||
|
|
||||||
|
// GPU metrics — try NVIDIA first, fall back to AMD
|
||||||
|
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
||||||
|
s.GPUs = gpus
|
||||||
|
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
||||||
|
s.GPUs = amdGPUs
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fan speeds — skipped silently if ipmitool unavailable
|
||||||
|
fans, _ := sampleFanSpeeds()
|
||||||
|
s.Fans = fans
|
||||||
|
|
||||||
|
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
||||||
|
if !hasTempGroup(s.Temps, "cpu") {
|
||||||
|
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
||||||
|
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// System power — returns 0 if unavailable
|
||||||
|
s.PowerW = sampleSystemPower()
|
||||||
|
|
||||||
|
// CPU load — from /proc/stat
|
||||||
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
|
|
||||||
|
// Memory load — from /proc/meminfo
|
||||||
|
s.MemLoadPct = sampleMemLoadPct()
|
||||||
|
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||||
|
// the overall CPU utilisation percentage.
|
||||||
|
func sampleCPULoadPct() float64 {
|
||||||
|
total0, idle0 := readCPUStat()
|
||||||
|
if total0 == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
time.Sleep(200 * time.Millisecond)
|
||||||
|
total1, idle1 := readCPUStat()
|
||||||
|
if total1 == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// cpuLoadPctBetween converts two cumulative (total, idle) CPU time readings
// into a utilisation percentage in the range [0, 100].
//
// It returns 0 when no time elapsed between the readings or when either
// counter went backwards. The counters are unsigned, so subtracting a larger
// previous value would wrap around to a huge number instead of going negative;
// those cases are rejected before any subtraction happens.
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
	if total <= prevTotal || idle < prevIdle {
		return 0
	}

	dt := float64(total - prevTotal)
	di := float64(idle - prevIdle)

	pct := (1 - di/dt) * 100
	switch {
	case pct < 0: // idle advanced more than total; treat as fully idle
		return 0
	case pct > 100:
		return 100
	}
	return pct
}
|
||||||
|
|
||||||
|
// readCPUStat reads the aggregate "cpu" line of /proc/stat and returns the
// cumulative total CPU time and the cumulative idle time (idle + iowait).
//
// Only the first eight counters (user, nice, system, idle, iowait, irq,
// softirq, steal) are summed into total. The trailing guest and guest_nice
// counters are left out because the kernel already accounts that time in
// user and nice, so adding them again would count it twice.
//
// It returns 0, 0 when the file cannot be opened, the aggregate line is
// missing, or one of the counters is not a valid unsigned integer.
func readCPUStat() (total, idle uint64) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0
	}
	defer f.Close()

	const accounted = 8 // user .. steal

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, "cpu ") {
			continue
		}
		fields := strings.Fields(line)[1:] // skip "cpu"

		var vals [accounted]uint64
		for i := 0; i < len(fields) && i < accounted; i++ {
			v, err := strconv.ParseUint(fields[i], 10, 64)
			if err != nil {
				// A malformed counter would skew the result; report the
				// sample as unavailable instead.
				return 0, 0
			}
			vals[i] = v
		}

		var sum uint64
		for _, v := range vals {
			sum += v
		}
		// idle = idle + iowait
		return sum, vals[3] + vals[4]
	}
	return 0, 0
}
|
||||||
|
|
||||||
|
// sampleMemLoadPct returns the percentage of memory in use according to
// /proc/meminfo, computed as (MemTotal - MemAvailable) / MemTotal * 100.
//
// When the MemAvailable line is absent, the available amount is approximated
// as MemFree + Buffers + Cached instead of being treated as zero (which would
// report 100% usage). The function returns 0 when the file cannot be read,
// MemTotal is missing, or the available amount is not smaller than the total.
func sampleMemLoadPct() float64 {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0
	}
	defer f.Close()

	vals := map[string]uint64{}
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		fields := strings.Fields(sc.Text())
		if len(fields) < 2 {
			continue
		}
		v, err := strconv.ParseUint(fields[1], 10, 64)
		if err != nil {
			continue // leave unparsable entries out so they read as missing
		}
		vals[strings.TrimSuffix(fields[0], ":")] = v
	}

	total := vals["MemTotal"]
	if total == 0 {
		return 0
	}

	avail, ok := vals["MemAvailable"]
	if !ok {
		avail = vals["MemFree"] + vals["Buffers"] + vals["Cached"]
	}
	// Both values are unsigned: guard the subtraction so an inconsistent
	// reading cannot wrap around to an enormous percentage.
	if avail >= total {
		return 0
	}
	return float64(total-avail) / float64(total) * 100
}
|
||||||
|
|
||||||
|
func hasTempGroup(temps []TempReading, group string) bool {
|
||||||
|
for _, t := range temps {
|
||||||
|
if t.Group == group {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTemperatureReadings() []TempReading {
|
||||||
|
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
return sampleLiveTempsViaIPMI()
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
|
||||||
|
temps := make([]TempReading, 0, len(chips))
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
featureNames := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
featureNames = append(featureNames, name)
|
||||||
|
}
|
||||||
|
sort.Strings(featureNames)
|
||||||
|
for _, name := range featureNames {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := firstTempInputValue(feature)
|
||||||
|
if !ok || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup(chip, name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if label == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName(chip, label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleLiveTempsViaIPMI() []TempReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var temps []TempReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
parts := strings.Split(line, "|")
|
||||||
|
if len(parts) < 3 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(parts[0])
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
||||||
|
if !strings.Contains(unit, "degrees") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
raw := strings.TrimSpace(parts[1])
|
||||||
|
if raw == "" || strings.EqualFold(raw, "na") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.ParseFloat(raw, 64)
|
||||||
|
if err != nil || value <= 0 || value > 150 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
group := classifyLiveTempGroup("", name)
|
||||||
|
if group == "gpu" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := name
|
||||||
|
if group == "ambient" {
|
||||||
|
label = compactAmbientTempName("", label)
|
||||||
|
}
|
||||||
|
key := group + "\x00" + label
|
||||||
|
if _, ok := seen[key]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = struct{}{}
|
||||||
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||||
|
}
|
||||||
|
return temps
|
||||||
|
}
|
||||||
|
|
||||||
|
// firstTempInputValue returns the first usable temperature input of a
// feature object. Candidate keys are those whose lower-cased name contains
// "temp" and ends in "_input"; they are tried in sorted order. A value is
// usable when it is a float64 or a string that parses as a float. The second
// result is false when no candidate yields a value.
func firstTempInputValue(feature map[string]any) (float64, bool) {
	var candidates []string
	for key := range feature {
		lowered := strings.ToLower(key)
		if strings.Contains(lowered, "temp") && strings.HasSuffix(lowered, "_input") {
			candidates = append(candidates, key)
		}
	}
	sort.Strings(candidates)

	for _, key := range candidates {
		if number, isFloat := feature[key].(float64); isFloat {
			return number, true
		}
		if text, isString := feature[key].(string); isString {
			if parsed, err := strconv.ParseFloat(text, 64); err == nil {
				return parsed, true
			}
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
|
// classifyLiveTempGroup assigns a sensor to the "gpu", "cpu" or "ambient"
// group by looking for known keywords in the lower-cased chip and sensor
// name. GPU keywords are checked first; anything that matches neither list is
// "ambient".
func classifyLiveTempGroup(chip, name string) string {
	haystack := strings.ToLower(strings.TrimSpace(chip + " " + name))
	containsAny := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(haystack, needle) {
				return true
			}
		}
		return false
	}

	if containsAny("gpu", "amdgpu", "nvidia", "adeon") {
		return "gpu"
	}
	if containsAny(
		"coretemp",
		"k10temp",
		"zenpower",
		"package id",
		"x86_pkg_temp",
		"tctl",
		"tdie",
		"tccd",
		"cpu",
		"peci",
	) {
		return "cpu"
	}
	return "ambient"
}
|
||||||
|
|
||||||
|
// compactAmbientTempName builds the display label for an ambient sensor.
// Both inputs are trimmed. The sensor name is returned on its own when the
// chip is empty, equals the name (ignoring case), or is already contained in
// the name (ignoring case); otherwise the result is "chip / name".
func compactAmbientTempName(chip, name string) string {
	chipLabel := strings.TrimSpace(chip)
	sensorLabel := strings.TrimSpace(name)

	switch {
	case chipLabel == "":
		return sensorLabel
	case strings.EqualFold(chipLabel, sensorLabel):
		return sensorLabel
	case strings.Contains(strings.ToLower(sensorLabel), strings.ToLower(chipLabel)):
		return sensorLabel
	}
	return chipLabel + " / " + sensorLabel
}
|
||||||
94
audit/internal/platform/live_metrics_test.go
Normal file
94
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestFirstTempInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"temp1_input": 61.5,
|
||||||
|
"temp1_max": 80.0,
|
||||||
|
}
|
||||||
|
got, ok := firstTempInputValue(feature)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected value")
|
||||||
|
}
|
||||||
|
if got != 61.5 {
|
||||||
|
t.Fatalf("got %v want 61.5", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifyLiveTempGroup(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
chip string
|
||||||
|
name string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
|
||||||
|
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
|
||||||
|
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
|
||||||
|
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
|
||||||
|
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCompactAmbientTempName(t *testing.T) {
|
||||||
|
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
|
||||||
|
t.Fatalf("got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCPULoadPctBetween(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
prevTotal uint64
|
||||||
|
prevIdle uint64
|
||||||
|
total uint64
|
||||||
|
idle uint64
|
||||||
|
want float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "busy half",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 90,
|
||||||
|
want: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "fully busy",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 40,
|
||||||
|
want: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no progress",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 100,
|
||||||
|
idle: 40,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "idle delta larger than total clamps to zero",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 150,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||||
|
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -2,6 +2,7 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
|||||||
out := make([]InterfaceInfo, 0, len(names))
|
out := make([]InterfaceInfo, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
state := "unknown"
|
state := "unknown"
|
||||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
if up, err := interfaceAdminState(name); err == nil {
|
||||||
fields := strings.Fields(string(raw))
|
if up {
|
||||||
if len(fields) >= 9 {
|
state = "up"
|
||||||
state = fields[8]
|
} else {
|
||||||
|
state = "down"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var ipv4 []string
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
if err != nil {
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
ipv4 = nil
|
||||||
fields := strings.Fields(line)
|
|
||||||
if len(fields) >= 4 {
|
|
||||||
ipv4 = append(ipv4, fields[3])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
|
||||||
|
names, err := listInterfaceNames()
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot := NetworkSnapshot{
|
||||||
|
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
|
||||||
|
}
|
||||||
|
for _, name := range names {
|
||||||
|
up, err := interfaceAdminState(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
ipv4, err := interfaceIPv4Addrs(name)
|
||||||
|
if err != nil {
|
||||||
|
return NetworkSnapshot{}, err
|
||||||
|
}
|
||||||
|
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
|
||||||
|
Name: name,
|
||||||
|
Up: up,
|
||||||
|
IPv4: ipv4,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line != "" {
|
||||||
|
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
|
||||||
|
snapshot.ResolvConf = string(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
return snapshot, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
|
||||||
|
var errs []string
|
||||||
|
|
||||||
|
for _, iface := range snapshot.Interfaces {
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
|
||||||
|
}
|
||||||
|
for _, cidr := range iface.IPv4 {
|
||||||
|
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
state := "down"
|
||||||
|
if iface.Up {
|
||||||
|
state = "up"
|
||||||
|
}
|
||||||
|
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
|
||||||
|
var exitErr *exec.ExitError
|
||||||
|
if !errors.As(err, &exitErr) {
|
||||||
|
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, route := range snapshot.DefaultRoutes {
|
||||||
|
fields := strings.Fields(route)
|
||||||
|
if len(fields) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Strip state flags that ip-route(8) does not accept as add arguments.
|
||||||
|
filtered := fields[:0]
|
||||||
|
for _, f := range fields {
|
||||||
|
switch f {
|
||||||
|
case "linkdown", "dead", "onlink", "pervasive":
|
||||||
|
// skip
|
||||||
|
default:
|
||||||
|
filtered = append(filtered, f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
args := append([]string{"route", "add"}, filtered...)
|
||||||
|
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
|
||||||
|
detail := strings.TrimSpace(string(raw))
|
||||||
|
if detail != "" {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
|
||||||
|
} else {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
|
||||||
|
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(errs) > 0 {
|
||||||
|
return errors.New(strings.Join(errs, "; "))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) DHCPOne(iface string) (string, error) {
|
func (s *System) DHCPOne(iface string) (string, error) {
|
||||||
var out bytes.Buffer
|
var out bytes.Buffer
|
||||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||||
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
|||||||
return out.String(), nil
|
return out.String(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetInterfaceState brings a network interface up or down.
|
||||||
|
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||||
|
state := "down"
|
||||||
|
if up {
|
||||||
|
state = "up"
|
||||||
|
}
|
||||||
|
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetInterfaceState returns true if the interface is UP.
|
||||||
|
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return interfaceAdminState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func interfaceAdminState(iface string) (bool, error) {
|
||||||
|
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return parseInterfaceAdminState(string(raw))
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseInterfaceAdminState inspects the first <FLAG,FLAG,...> list in `ip
// link` output and reports whether it contains the UP flag. It returns an
// error when the opening '<' or the closing '>' of the list is missing.
func parseInterfaceAdminState(raw string) (bool, error) {
	open := strings.IndexByte(raw, '<')
	if open < 0 {
		return false, fmt.Errorf("ip link output missing flags")
	}

	afterOpen := raw[open+1:]
	closing := strings.IndexByte(afterOpen, '>')
	if closing < 0 {
		return false, fmt.Errorf("ip link output missing flag terminator")
	}

	for _, candidate := range strings.Split(afterOpen[:closing], ",") {
		if strings.TrimSpace(candidate) != "UP" {
			continue
		}
		return true, nil
	}
	return false, nil
}
|
||||||
|
|
||||||
|
// interfaceIPv4Addrs returns the fourth whitespace-separated field of every
// line printed by `ip -o -4 addr show dev <iface>`, which is where that
// command puts the address in CIDR form.
//
// A non-zero exit status from ip is treated as "no addresses" and yields
// nil, nil; any other failure to run the command is returned as an error.
func interfaceIPv4Addrs(iface string) ([]string, error) {
	output, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
	if err != nil {
		var exitErr *exec.ExitError
		if !errors.As(err, &exitErr) {
			return nil, err
		}
		return nil, nil
	}

	var addrs []string
	lines := strings.Split(strings.TrimSpace(string(output)), "\n")
	for i := range lines {
		cols := strings.Fields(lines[i])
		if len(cols) < 4 {
			continue
		}
		addrs = append(addrs, cols[3])
	}
	return addrs, nil
}
|
||||||
|
|
||||||
func listInterfaceNames() ([]string, error) {
|
func listInterfaceNames() ([]string, error) {
|
||||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseInterfaceAdminState(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
raw string
|
||||||
|
want bool
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "admin up with no carrier",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "admin down",
|
||||||
|
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "malformed output",
|
||||||
|
raw: "2: enp1s0: mtu 1500 state DOWN\n",
|
||||||
|
wantErr: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
got, err := parseInterfaceAdminState(tt.raw)
|
||||||
|
if tt.wantErr {
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tt.want {
|
||||||
|
t.Fatalf("got %v want %v", got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
203
audit/internal/platform/nvidia_stress.go
Normal file
203
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
normalizeNvidiaStressOptions(&opts)
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(opts)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
|
job,
|
||||||
|
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
return "gpu-nvidia-john"
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
return "gpu-nvidia-nccl"
|
||||||
|
default:
|
||||||
|
return "gpu-nvidia-burn"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||||
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return satJob{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||||
|
switch loader {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-gpu-burn.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-john-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-nccl-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
default:
|
||||||
|
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||||
|
if opts.DurationSec <= 0 {
|
||||||
|
opts.DurationSec = 300
|
||||||
|
}
|
||||||
|
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||||
|
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
opts.Loader = NvidiaStressLoaderJohn
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
opts.Loader = NvidiaStressLoaderNCCL
|
||||||
|
default:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
}
|
||||||
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||||
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
|
||||||
|
selected := all
|
||||||
|
if len(include) > 0 {
|
||||||
|
want := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
want[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
selected = selected[:0]
|
||||||
|
for _, idx := range all {
|
||||||
|
if _, ok := want[idx]; ok {
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||||
|
}
|
||||||
|
out := append([]int(nil), selected...)
|
||||||
|
sort.Ints(out)
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func listNvidiaGPUIndices() ([]int, error) {
|
||||||
|
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(line)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
return dedupeSortedIndices(indices), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// dedupeSortedIndices returns the distinct non-negative values in ascending
// order. It returns nil for empty input; input that contains only negative
// values yields an empty, non-nil slice. The input slice is not modified.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	// Drop negatives and sort a private copy first, so duplicates end up
	// next to each other.
	sorted := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			sorted = append(sorted, v)
		}
	}
	sort.Ints(sorted)

	unique := make([]int, 0, len(sorted))
	for _, v := range sorted {
		if n := len(unique); n > 0 && unique[n-1] == v {
			continue
		}
		unique = append(unique, v)
	}
	return unique
}
|
||||||
|
|
||||||
|
// joinIndexList formats values as decimal numbers separated by commas, with
// no spaces. An empty list yields the empty string.
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||||
545
audit/internal/platform/platform_stress.go
Normal file
545
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,545 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"bytes"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PlatformStressCycle describes a single thermal cycle: a load phase followed
// by an idle observation phase.
type PlatformStressCycle struct {
	// LoadSec is the number of seconds of simultaneous CPU+GPU stress.
	LoadSec int
	// IdleSec is the number of seconds of idle monitoring after the load is cut.
	IdleSec int
}
|
||||||
|
|
||||||
|
// PlatformStressOptions controls the thermal cycling test.
|
||||||
|
type PlatformStressOptions struct {
|
||||||
|
Cycles []PlatformStressCycle
|
||||||
|
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||||
|
}
|
||||||
|
|
||||||
|
// platformStressRow is one telemetry sample (one per second of a phase).
type platformStressRow struct {
	// ElapsedSec is the time since the start of the whole test, in seconds.
	ElapsedSec float64
	// Cycle is the 1-based cycle number the sample belongs to.
	Cycle int
	// Phase is either "load" or "idle".
	Phase string

	CPULoadPct  float64
	MaxCPUTempC float64
	MaxGPUTempC float64
	SysPowerW   float64

	// FanMinRPM and FanMaxRPM are the slowest and fastest fan readings in the sample.
	FanMinRPM float64
	FanMaxRPM float64

	GPUThrottled bool
}
|
||||||
|
|
||||||
|
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||||
|
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||||
|
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||||
|
func (s *System) RunPlatformStress(
|
||||||
|
ctx context.Context,
|
||||||
|
baseDir string,
|
||||||
|
opts PlatformStressOptions,
|
||||||
|
logFunc func(string),
|
||||||
|
) (string, error) {
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if len(opts.Cycles) == 0 {
|
||||||
|
return "", fmt.Errorf("no cycles defined")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
stamp := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||||
|
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||||
|
|
||||||
|
vendor := s.DetectGPUVendor()
|
||||||
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||||
|
|
||||||
|
var rows []platformStressRow
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
var analyses []cycleAnalysis
|
||||||
|
|
||||||
|
for i, cycle := range opts.Cycles {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
cycleNum := i + 1
|
||||||
|
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||||
|
|
||||||
|
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||||
|
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
// CPU stress
|
||||||
|
if hasCPU {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||||
|
if err != nil {
|
||||||
|
logFunc("CPU stress: " + err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU stress
|
||||||
|
if hasGPU {
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||||
|
if gpuCmd == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = gpuCmd.Wait()
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Monitoring goroutine for load phase
|
||||||
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||||
|
for _, r := range loadRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, loadRows...)
|
||||||
|
loadCancel()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||||
|
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||||
|
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||||
|
for _, r := range idleRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, idleRows...)
|
||||||
|
idleCancel()
|
||||||
|
|
||||||
|
// Per-cycle analysis
|
||||||
|
an := analyzePlatformCycle(loadRows, idleRows)
|
||||||
|
analyses = append(analyses, an)
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||||
|
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write CSV
|
||||||
|
csvData := writePlatformCSV(rows)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||||
|
|
||||||
|
// Write summary
|
||||||
|
summary := writePlatformSummary(opts, analyses)
|
||||||
|
logFunc("--- Summary ---")
|
||||||
|
for _, line := range strings.Split(summary, "\n") {
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
|
// Pack tar.gz
|
||||||
|
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||||
|
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||||
|
return "", fmt.Errorf("pack archive: %w", err)
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(runDir)
|
||||||
|
return archivePath, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
|
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||||
|
var rows []platformStressRow
|
||||||
|
ticker := time.NewTicker(time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return rows
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := SampleLiveMetrics()
|
||||||
|
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||||
|
r := platformStressRow{
|
||||||
|
ElapsedSec: time.Since(testStart).Seconds(),
|
||||||
|
Cycle: cycle,
|
||||||
|
Phase: phase,
|
||||||
|
CPULoadPct: s.CPULoadPct,
|
||||||
|
SysPowerW: s.PowerW,
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
switch t.Group {
|
||||||
|
case "cpu":
|
||||||
|
if t.Celsius > r.MaxCPUTempC {
|
||||||
|
r.MaxCPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
case "gpu":
|
||||||
|
if t.Celsius > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.TempC > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = g.TempC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(s.Fans) > 0 {
|
||||||
|
r.FanMinRPM = s.Fans[0].RPM
|
||||||
|
r.FanMaxRPM = s.Fans[0].RPM
|
||||||
|
for _, f := range s.Fans[1:] {
|
||||||
|
if f.RPM < r.FanMinRPM {
|
||||||
|
r.FanMinRPM = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > r.FanMaxRPM {
|
||||||
|
r.FanMaxRPM = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPlatformRow(r platformStressRow) string {
|
||||||
|
throttle := ""
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttle = " THROTTLE"
|
||||||
|
}
|
||||||
|
fans := ""
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||||
|
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||||
|
var an cycleAnalysis
|
||||||
|
for _, r := range loadRows {
|
||||||
|
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||||
|
an.maxCPUTemp = r.MaxCPUTempC
|
||||||
|
}
|
||||||
|
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||||
|
an.maxGPUTemp = r.MaxGPUTempC
|
||||||
|
}
|
||||||
|
if r.SysPowerW > an.maxPower {
|
||||||
|
an.maxPower = r.SysPowerW
|
||||||
|
}
|
||||||
|
if r.GPUThrottled {
|
||||||
|
an.throttled = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM at cut = avg of last 5 load rows
|
||||||
|
if n := len(loadRows); n > 0 {
|
||||||
|
window := loadRows
|
||||||
|
if n > 5 {
|
||||||
|
window = loadRows[n-5:]
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
var cnt int
|
||||||
|
for _, r := range window {
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
cnt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt > 0 {
|
||||||
|
an.fanAtCutAvg = sum / float64(cnt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM min in first 15s of idle
|
||||||
|
an.fanMin15s = an.fanAtCutAvg
|
||||||
|
var cutElapsed float64
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||||
|
}
|
||||||
|
for _, r := range idleRows {
|
||||||
|
if r.ElapsedSec > cutElapsed+15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||||
|
an.fanMin15s = avg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||||
|
}
|
||||||
|
return an
|
||||||
|
}
|
||||||
|
|
||||||
|
// cycleAnalysis holds the per-cycle figures produced by analyzePlatformCycle.
type cycleAnalysis struct {
	// Peaks observed during the load phase.
	maxCPUTemp float64
	maxGPUTemp float64
	maxPower   float64

	// throttled is true when any load row was flagged as GPU-throttled.
	throttled bool

	// fanAtCutAvg is the average fan speed over the last load rows.
	fanAtCutAvg float64
	// fanMin15s is the lowest fan speed seen in the first 15 s of idle.
	fanMin15s float64
	// fanDropPct is the drop from fanAtCutAvg to fanMin15s, in percent.
	fanDropPct float64
}
|
||||||
|
|
||||||
|
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||||
|
|
||||||
|
totalThrottle := 0
|
||||||
|
totalFanWarn := 0
|
||||||
|
for i, an := range analyses {
|
||||||
|
cycle := opts.Cycles[i]
|
||||||
|
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||||
|
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||||
|
if an.throttled {
|
||||||
|
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||||
|
totalThrottle++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Throttle: none\n")
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||||
|
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||||
|
if an.fanDropPct > 20 {
|
||||||
|
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||||
|
totalFanWarn++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||||
|
if totalThrottle > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||||
|
} else if totalFanWarn > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
w := csv.NewWriter(&buf)
|
||||||
|
_ = w.Write([]string{
|
||||||
|
"elapsed_sec", "cycle", "phase",
|
||||||
|
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||||
|
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||||
|
})
|
||||||
|
for _, r := range rows {
|
||||||
|
throttled := "0"
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttled = "1"
|
||||||
|
}
|
||||||
|
_ = w.Write([]string{
|
||||||
|
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||||
|
strconv.Itoa(r.Cycle),
|
||||||
|
r.Phase,
|
||||||
|
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||||
|
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||||
|
throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
w.Flush()
|
||||||
|
return buf.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||||
|
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||||
|
path, err := satLookPath("stressapptest")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||||
|
}
|
||||||
|
// Use a very long duration; the context timeout will kill it at the right time.
|
||||||
|
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||||
|
if threads := platformStressCPUThreads(); threads > 0 {
|
||||||
|
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||||
|
}
|
||||||
|
if mb := platformStressMemoryMB(); mb > 0 {
|
||||||
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||||
|
}
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
|
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||||
|
switch strings.ToLower(vendor) {
|
||||||
|
case "amd":
|
||||||
|
return buildAMDGPUStressCmd(ctx)
|
||||||
|
case "nvidia":
|
||||||
|
return buildNvidiaGPUStressCmd(ctx)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
rvsArgs, err := resolveRVSCommand()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rvsPath := rvsArgs[0]
|
||||||
|
cfg := `actions:
|
||||||
|
- name: gst_platform
|
||||||
|
device: all
|
||||||
|
module: gst
|
||||||
|
parallel: true
|
||||||
|
duration: 86400000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size_a: 8640
|
||||||
|
matrix_size_b: 8640
|
||||||
|
matrix_size_c: 8640
|
||||||
|
`
|
||||||
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
|
if err != nil {
|
||||||
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
// startLowPriorityCmd starts cmd and then sets the scheduling priority of the
// new process to nice. A failure to start is returned; a failure to change
// the priority is ignored, so the process simply keeps its priority.
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
	err := cmd.Start()
	if err == nil && cmd.Process != nil {
		// Best effort: the command is already running either way.
		_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
	}
	return err
}
|
||||||
|
|
||||||
|
func platformStressCPUThreads() int {
|
||||||
|
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
cpus := runtime.NumCPU()
|
||||||
|
switch {
|
||||||
|
case cpus <= 2:
|
||||||
|
return 1
|
||||||
|
case cpus <= 8:
|
||||||
|
return cpus - 1
|
||||||
|
default:
|
||||||
|
return cpus - 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func platformStressMemoryMB() int {
|
||||||
|
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||||
|
return mb
|
||||||
|
}
|
||||||
|
free := freeMemBytes()
|
||||||
|
if free <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||||
|
if mb < 1024 {
|
||||||
|
return 1024
|
||||||
|
}
|
||||||
|
return mb
|
||||||
|
}
|
||||||
|
|
||||||
|
// containsComponent reports whether name appears in components. Entries are
// compared case-insensitively after trimming surrounding whitespace, so
// values such as "GPU" or " cpu" still select the component instead of
// silently disabling it.
func containsComponent(components []string, name string) bool {
	for _, c := range components {
		if strings.EqualFold(strings.TrimSpace(c), name) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// packPlatformDir writes the regular files directly inside dir into a gzip
// compressed tar archive at dest. Entries are stored under the base name of
// dir, using forward slashes as tar requires. Subdirectories are skipped, and
// a file that cannot be read is left out rather than failing the archive.
//
// The directory is listed before dest is created, so an unreadable dir leaves
// no empty archive behind. Errors from closing the tar, gzip and file writers
// are returned, because those calls flush the final data; on any error the
// partial archive is removed.
func packPlatformDir(dir, dest string) (err error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)
	defer func() {
		// Close innermost first; keep the first error seen.
		for _, closeErr := range []error{tw.Close(), gz.Close(), f.Close()} {
			if err == nil && closeErr != nil {
				err = closeErr
			}
		}
		if err != nil {
			_ = os.Remove(dest) // never leave a truncated archive behind
		}
	}()

	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		data, readErr := os.ReadFile(filepath.Join(dir, e.Name()))
		if readErr != nil {
			// Best effort: archive whatever is readable.
			continue
		}
		hdr := &tar.Header{
			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if hdrErr := tw.WriteHeader(hdr); hdrErr != nil {
			return hdrErr
		}
		if _, writeErr := tw.Write(data); writeErr != nil {
			return writeErr
		}
	}
	return nil
}
|
||||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"runtime"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||||
|
if got := platformStressCPUThreads(); got != 7 {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||||
|
got := platformStressCPUThreads()
|
||||||
|
if got < 1 {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||||
|
}
|
||||||
|
if got > runtime.NumCPU() {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||||
|
}
|
||||||
|
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||||
|
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||||
|
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||||
|
if got := platformStressMemoryMB(); got != 8192 {
|
||||||
|
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -135,9 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
case "nvidia":
|
case "nvidia":
|
||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
|
"dcgmi",
|
||||||
|
"nv-hostengine",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
|
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||||
@@ -152,6 +158,16 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
return tools
|
return tools
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
path, err := exec.LookPath(candidate)
|
||||||
|
if err == nil {
|
||||||
|
return ToolStatus{Name: display, Path: path, OK: true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ToolStatus{Name: display}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
lsmodText := commandText("lsmod")
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
@@ -176,8 +192,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
health.DriverReady = true
|
health.DriverReady = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
health.CUDAReady = true
|
health.CUDAReady = true
|
||||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
@@ -13,6 +15,8 @@ import (
|
|||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,19 +34,63 @@ var (
|
|||||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||||
}
|
}
|
||||||
|
rvsExecutableGlobs = []string{
|
||||||
|
"/opt/rocm/bin/rvs",
|
||||||
|
"/opt/rocm-*/bin/rvs",
|
||||||
|
}
|
||||||
|
dcgmProfTesterCandidates = []string{
|
||||||
|
"dcgmproftester",
|
||||||
|
"dcgmproftester13",
|
||||||
|
"dcgmproftester12",
|
||||||
|
"dcgmproftester11",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// streamExecOutput runs cmd to completion, passing each line of its combined
// stdout+stderr to logFunc (when non-nil) as it arrives. It returns the
// captured lines, each terminated by "\n", together with the error from
// starting or waiting on the command.
//
// Lines of up to 1 MiB are captured. If a longer line makes the scanner give
// up, the remaining output is drained and discarded so the command can never
// block on a pipe that nobody reads; output from that point on is not
// captured.
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
	pr, pw := io.Pipe()
	cmd.Stdout = pw
	cmd.Stderr = pw

	var buf bytes.Buffer
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		scanner := bufio.NewScanner(pr)
		// Raise the per-line limit above bufio's default token size.
		scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
		for scanner.Scan() {
			line := scanner.Text()
			buf.WriteString(line + "\n")
			if logFunc != nil {
				logFunc(line)
			}
		}
		// The scanner stops early on an over-long line. Keep reading until
		// the writer is closed; otherwise cmd.Wait would hang forever.
		_, _ = io.Copy(io.Discard, pr)
	}()

	err := cmd.Start()
	if err != nil {
		_ = pw.Close()
		wg.Wait()
		return nil, err
	}
	waitErr := cmd.Wait()
	// Closing the writer delivers EOF to the reader goroutine.
	_ = pw.Close()
	wg.Wait()
	return buf.Bytes(), waitErr
}
|
||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
MemoryMB int
|
MemoryMB int `json:"memory_mb"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||||
@@ -53,6 +101,12 @@ func (s *System) DetectGPUVendor() string {
|
|||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return "amd"
|
return "amd"
|
||||||
}
|
}
|
||||||
|
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||||
|
text := strings.ToLower(string(raw))
|
||||||
|
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||||
|
return "amd"
|
||||||
|
}
|
||||||
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,13 +134,103 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||||
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
})
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||||
|
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: mem_integrity
|
||||||
|
device: all
|
||||||
|
module: mem
|
||||||
|
parallel: true
|
||||||
|
duration: 60000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 8640
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||||
|
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: babel_mem_bw
|
||||||
|
device: all
|
||||||
|
module: babel
|
||||||
|
parallel: true
|
||||||
|
copy_matrix: true
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 134217728
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||||
|
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||||
|
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||||
|
rvsCfg := amdStressRVSConfig(seconds)
|
||||||
|
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// amdStressRVSConfig returns the RVS configuration text for a GST stress run
// lasting the given number of seconds; the duration field is written as
// seconds*1000.
//
// NOTE(review): in this view the keys below "- name" appear flush-left;
// confirm the literal's whitespace in the real file nests them under the
// list item as the RVS configuration format expects.
func amdStressRVSConfig(seconds int) string {
	const template = `actions:
- name: gst_stress
device: all
module: gst
parallel: true
duration: %d
copy_matrix: false
target_stress: 90
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
`
	duration := seconds * 1000
	return fmt.Sprintf(template, duration)
}
|
||||||
|
|
||||||
|
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||||
|
return []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||||
@@ -118,42 +262,213 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
MemoryMB: memMB,
|
MemoryMB: memMB,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
|
return gpus[i].Index < gpus[j].Index
|
||||||
|
})
|
||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
// detect GPU count
|
||||||
|
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||||
|
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||||
|
if gpuCount < 1 {
|
||||||
|
gpuCount = 1
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
|
}},
|
||||||
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
if err != nil {
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
return "", err
|
||||||
|
}
|
||||||
|
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
|
{
|
||||||
|
name: "03-dcgmproftester.log",
|
||||||
|
cmd: profCmd,
|
||||||
|
env: nvidiaVisibleDevicesEnv(selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{
|
||||||
|
name: "02-dcgmi-targeted-power.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{
|
||||||
|
name: "02-dcgmi-pulse-test.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{
|
||||||
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||||
|
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||||
|
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||||
|
// ctx cancellation kills the running job.
|
||||||
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{
|
||||||
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
return dedupeSortedIndices(gpuIndices), nil
|
||||||
|
}
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
return all, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
})
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||||
|
sizeArg := "80%"
|
||||||
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
|
sizeArg = fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
|
"stress-ng", "--vm", "1",
|
||||||
|
"--vm-bytes", sizeArg,
|
||||||
|
"--vm-method", "all",
|
||||||
|
"--timeout", fmt.Sprintf("%d", seconds),
|
||||||
|
"--metrics-brief",
|
||||||
|
}},
|
||||||
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
seconds := durationSec
|
||||||
|
if seconds <= 0 {
|
||||||
|
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||||
|
}
|
||||||
|
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||||
|
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||||
|
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||||||
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
|
{name: "02-stressapptest.log", cmd: cmd},
|
||||||
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
if durationSec <= 0 {
|
if durationSec <= 0 {
|
||||||
durationSec = 60
|
durationSec = 60
|
||||||
}
|
}
|
||||||
return runAcceptancePack(baseDir, "cpu", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||||||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||||||
})
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -181,11 +496,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for index, devPath := range devices {
|
for index, devPath := range devices {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||||
out, err := runSATCommand(verboseLog, job.name, job.cmd)
|
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
@@ -223,83 +544,64 @@ type satStats struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
|
||||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
if baseDir == "" {
|
if diagLevel < 1 || diagLevel > 4 {
|
||||||
baseDir = "/var/log/bee-sat"
|
diagLevel = 3
|
||||||
}
|
}
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||||
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
|
||||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
||||||
|
|
||||||
var summary strings.Builder
|
|
||||||
stats := satStats{}
|
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
|
||||||
for _, job := range jobs {
|
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
|
||||||
for _, arg := range job.cmd {
|
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
|
||||||
}
|
|
||||||
out, err := runSATCommand(verboseLog, job.name, cmd)
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
|
||||||
return "", writeErr
|
|
||||||
}
|
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
|
||||||
stats.Add(status)
|
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
|
||||||
}
|
|
||||||
writeSATStats(&summary, stats)
|
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
|
||||||
var env []string
|
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
ids := make([]string, len(gpuIndices))
|
ids := make([]string, len(gpuIndices))
|
||||||
for i, idx := range gpuIndices {
|
for i, idx := range gpuIndices {
|
||||||
ids[i] = strconv.Itoa(idx)
|
ids[i] = strconv.Itoa(idx)
|
||||||
}
|
}
|
||||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||||
{
|
|
||||||
name: "05-bee-gpu-stress.log",
|
|
||||||
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
|
||||||
env: env,
|
|
||||||
collectGPU: true,
|
|
||||||
gpuIndices: gpuIndices,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
|
args := []string{"dcgmi", "diag", "-r", name}
|
||||||
|
if durationSec > 0 {
|
||||||
|
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||||
|
}
|
||||||
|
return args
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeNvidiaBurnDuration returns durationSec unchanged when it is
// positive, and 300 for zero or negative values.
func normalizeNvidiaBurnDuration(durationSec int) int {
	if durationSec > 0 {
		return durationSec
	}
	return 300
}
|
||||||
|
|
||||||
|
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -326,9 +628,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var err error
|
var err error
|
||||||
|
|
||||||
if job.collectGPU {
|
if job.collectGPU {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
} else {
|
} else {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
@@ -352,13 +654,16 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
return archive, nil
|
return archive, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
@@ -370,10 +675,17 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
}
|
}
|
||||||
|
|
||||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
|
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
c.Cancel = func() error {
|
||||||
|
if c.Process != nil {
|
||||||
|
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if len(env) > 0 {
|
if len(env) > 0 {
|
||||||
c.Env = append(os.Environ(), env...)
|
c.Env = append(os.Environ(), env...)
|
||||||
}
|
}
|
||||||
out, err := c.CombinedOutput()
|
out, err := streamExecOutput(c, logFunc)
|
||||||
|
|
||||||
rc := 0
|
rc := 0
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -448,27 +760,39 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
text := strings.ToLower(string(out))
|
text := strings.ToLower(string(out))
|
||||||
|
// No output at all means the tool failed to start (mlock limit, binary missing,
|
||||||
|
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
|
||||||
|
if len(strings.TrimSpace(text)) == 0 {
|
||||||
|
return "UNSUPPORTED", rc
|
||||||
|
}
|
||||||
if strings.Contains(text, "unsupported") ||
|
if strings.Contains(text, "unsupported") ||
|
||||||
strings.Contains(text, "not supported") ||
|
strings.Contains(text, "not supported") ||
|
||||||
|
strings.Contains(text, "not found in path") ||
|
||||||
strings.Contains(text, "invalid opcode") ||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
strings.Contains(text, "unknown command") ||
|
strings.Contains(text, "unknown command") ||
|
||||||
strings.Contains(text, "not implemented") ||
|
strings.Contains(text, "not implemented") ||
|
||||||
strings.Contains(text, "not available") ||
|
strings.Contains(text, "not available") ||
|
||||||
strings.Contains(text, "cuda_error_system_not_ready") ||
|
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||||
strings.Contains(text, "no such device") ||
|
strings.Contains(text, "no such device") ||
|
||||||
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
return "FAILED", rc
|
return "FAILED", rc
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
|
||||||
start := time.Now().UTC()
|
start := time.Now().UTC()
|
||||||
resolvedCmd, err := resolveSATCommand(cmd)
|
resolvedCmd, err := resolveSATCommand(cmd)
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||||
)
|
)
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
appendSATVerboseLog(verboseLog,
|
appendSATVerboseLog(verboseLog,
|
||||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||||
@@ -479,7 +803,7 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
|||||||
return []byte(err.Error() + "\n"), err
|
return []byte(err.Error() + "\n"), err
|
||||||
}
|
}
|
||||||
|
|
||||||
out, err := satExecCommand(resolvedCmd[0], resolvedCmd[1:]...).CombinedOutput()
|
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
|
||||||
|
|
||||||
rc := 0
|
rc := 0
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -506,10 +830,27 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
|||||||
if len(cmd) == 0 {
|
if len(cmd) == 0 {
|
||||||
return nil, errors.New("empty SAT command")
|
return nil, errors.New("empty SAT command")
|
||||||
}
|
}
|
||||||
if cmd[0] != "rocm-smi" {
|
switch cmd[0] {
|
||||||
return cmd, nil
|
case "rocm-smi":
|
||||||
|
return resolveROCmSMICommand(cmd[1:]...)
|
||||||
|
case "rvs":
|
||||||
|
return resolveRVSCommand(cmd[1:]...)
|
||||||
}
|
}
|
||||||
return resolveROCmSMICommand(cmd[1:]...)
|
path, err := satLookPath(cmd[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||||
|
}
|
||||||
|
return append([]string{path}, cmd[1:]...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||||
|
if path, err := satLookPath("rvs"); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||||
@@ -533,6 +874,29 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||||
|
for _, candidate := range dcgmProfTesterCandidates {
|
||||||
|
if path, err := satLookPath(candidate); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, errors.New("dcgmproftester not found in PATH")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureAMDRuntimeReady reports whether the AMD GPU runtime looks usable.
//
// It returns nil when /dev/kfd can be stat'ed, or when
// /sys/module/amdgpu/initstate reads "live" (case-insensitive). If initstate
// is readable but holds another value, the error includes that value; if it
// cannot be read at all, the error states that /dev/kfd is missing and amdgpu
// is not loaded.
func ensureAMDRuntimeReady() error {
	if _, statErr := os.Stat("/dev/kfd"); statErr == nil {
		return nil
	}

	raw, readErr := os.ReadFile("/sys/module/amdgpu/initstate")
	if readErr != nil {
		return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
	}

	state := strings.TrimSpace(string(raw))
	if !strings.EqualFold(state, "live") {
		return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
	}
	return nil
}
|
||||||
|
|
||||||
func rocmSMIExecutableCandidates() []string {
|
func rocmSMIExecutableCandidates() []string {
|
||||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||||
}
|
}
|
||||||
@@ -581,7 +945,7 @@ func parseStorageDevices(raw string) []string {
|
|||||||
|
|
||||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
|
||||||
stopCh := make(chan struct{})
|
stopCh := make(chan struct{})
|
||||||
doneCh := make(chan struct{})
|
doneCh := make(chan struct{})
|
||||||
var metricRows []GPUMetricRow
|
var metricRows []GPUMetricRow
|
||||||
@@ -609,7 +973,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||||||
|
|
||||||
close(stopCh)
|
close(stopCh)
|
||||||
<-doneCh
|
<-doneCh
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -49,6 +51,18 @@ type FanStressRow struct {
|
|||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64 // DCMI system power reading
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cachedPowerReading is a system power sample together with the time it was
// recorded.
type cachedPowerReading struct {
	Value     float64   // last stored power value
	UpdatedAt time.Time // when Value was stored
}

var (
	// systemPowerCacheMu guards systemPowerCache.
	systemPowerCacheMu sync.Mutex
	// systemPowerCache is the cache state carried between sampleSystemPower calls.
	systemPowerCache cachedPowerReading
)

// systemPowerHoldTTL is the maximum age at which a cached reading may still
// stand in for a live reading that came back as zero.
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -128,26 +142,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
stats.OK++
|
stats.OK++
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
setPhase(phaseName)
|
setPhase(phaseName)
|
||||||
var env []string
|
|
||||||
if len(opts.GPUIndices) > 0 {
|
|
||||||
ids := make([]string, len(opts.GPUIndices))
|
|
||||||
for i, idx := range opts.GPUIndices {
|
|
||||||
ids[i] = strconv.Itoa(idx)
|
|
||||||
}
|
|
||||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
|
||||||
}
|
|
||||||
cmd := []string{
|
cmd := []string{
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(durSec),
|
"--seconds", strconv.Itoa(durSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env)
|
if len(opts.GPUIndices) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||||
|
}
|
||||||
|
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||||
@@ -304,41 +313,148 @@ func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
|||||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||||
func sampleFanSpeeds() ([]FanReading, error) {
|
func sampleFanSpeeds() ([]FanReading, error) {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
|
if err == nil {
|
||||||
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
|
if len(fans) > 0 {
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return parseFanSpeeds(string(out)), nil
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Line format: "FAN1 | 2400.000 | RPM | ok"
|
// Handles two formats:
|
||||||
|
//
|
||||||
|
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||||
|
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||||
func parseFanSpeeds(raw string) []FanReading {
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
var fans []FanReading
|
var fans []FanReading
|
||||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
parts := strings.Split(line, "|")
|
parts := strings.Split(line, "|")
|
||||||
if len(parts) < 3 {
|
if len(parts) < 2 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
unit := strings.TrimSpace(parts[2])
|
name := strings.TrimSpace(parts[0])
|
||||||
if !strings.EqualFold(unit, "RPM") {
|
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||||
|
rpmVal := 0.0
|
||||||
|
found := false
|
||||||
|
for _, p := range parts[1:] {
|
||||||
|
p = strings.TrimSpace(p)
|
||||||
|
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.EqualFold(p, "RPM") {
|
||||||
|
continue // unit-only column in old format; value is in previous field
|
||||||
|
}
|
||||||
|
val, err := parseFanRPMValue(p)
|
||||||
|
if err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||||
|
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||||
|
valStr := strings.TrimSpace(parts[1])
|
||||||
|
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||||
|
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||||
|
rpmVal = val
|
||||||
|
found = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
valStr := strings.TrimSpace(parts[1])
|
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||||
if strings.EqualFold(valStr, "na") || strings.EqualFold(valStr, "disabled") || valStr == "" {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
val, err := strconv.ParseFloat(valStr, 64)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
fans = append(fans, FanReading{
|
|
||||||
Name: strings.TrimSpace(parts[0]),
|
|
||||||
RPM: val,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
return fans
|
return fans
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseFanRPMValue extracts the leading number from a fan value field such as
// "4340 RPM", "1,800 RPM" or "2400.000".
//
// Commas are removed before parsing so thousands separators are tolerated.
// Only the first whitespace-separated token is parsed; anything after it (for
// example a trailing unit) is ignored. It returns strconv.ErrSyntax when the
// field contains no token at all, or the strconv.ParseFloat error when the
// first token is not a number.
func parseFanRPMValue(raw string) (float64, error) {
	fields := strings.Fields(strings.TrimSpace(strings.ReplaceAll(raw, ",", "")))
	if len(fields) == 0 {
		return 0, strconv.ErrSyntax
	}
	return strconv.ParseFloat(fields[0], 64)
}
|
||||||
|
|
||||||
|
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
var fans []FanReading
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
names := make([]string, 0, len(features))
|
||||||
|
for name := range features {
|
||||||
|
names = append(names, name)
|
||||||
|
}
|
||||||
|
sort.Strings(names)
|
||||||
|
for _, name := range names {
|
||||||
|
feature, ok := features[name].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rpm, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(name)
|
||||||
|
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
|
||||||
|
label = chip + " / " + label
|
||||||
|
}
|
||||||
|
if _, ok := seen[label]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[label] = struct{}{}
|
||||||
|
fans = append(fans, FanReading{Name: label, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fans, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// firstFanInputValue returns the first usable fan input value in a decoded
// "sensors -j" feature object.
//
// A key qualifies when, compared case-insensitively, it contains "fan" and
// ends in "_input". Keys are examined in sorted order so the choice is
// deterministic. The value may be a JSON number or a numeric string; a
// qualifying key whose value is neither (or is an unparsable string) is
// skipped and the search continues. The boolean result is false when no key
// yields a value.
func firstFanInputValue(feature map[string]any) (float64, bool) {
	keys := make([]string, 0, len(feature))
	for key := range feature {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	for _, key := range keys {
		lower := strings.ToLower(key)
		if !strings.Contains(lower, "fan") || !strings.HasSuffix(lower, "_input") {
			continue
		}
		switch value := feature[key].(type) {
		case float64:
			return value, true
		case string:
			f, err := strconv.ParseFloat(value, 64)
			if err == nil {
				return f, true
			}
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||||
func sampleCPUMaxTemp() float64 {
|
func sampleCPUMaxTemp() float64 {
|
||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||||
@@ -404,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
func sampleSystemPower() float64 {
|
func sampleSystemPower() float64 {
|
||||||
|
now := time.Now()
|
||||||
|
current := 0.0
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return 0
|
current = parseDCMIPowerReading(string(out))
|
||||||
}
|
}
|
||||||
return parseDCMIPowerReading(string(out))
|
systemPowerCacheMu.Lock()
|
||||||
|
defer systemPowerCacheMu.Unlock()
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||||
|
systemPowerCache = updated
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -431,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||||
|
if current > 0 {
|
||||||
|
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||||
|
return current, cache
|
||||||
|
}
|
||||||
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
return cache.Value, cache
|
||||||
|
}
|
||||||
|
return 0, cache
|
||||||
|
}
|
||||||
|
|
||||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
// during either load phase.
|
// during either load phase.
|
||||||
func analyzeThrottling(rows []FanStressRow) bool {
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
|||||||
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseFanSpeeds(t *testing.T) {
|
||||||
|
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||||
|
got := parseFanSpeeds(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
|
||||||
|
}
|
||||||
|
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
|
||||||
|
t.Fatalf("fan0=%+v", got[0])
|
||||||
|
}
|
||||||
|
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
|
||||||
|
t.Fatalf("fan1=%+v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFirstFanInputValue(t *testing.T) {
|
||||||
|
feature := map[string]any{
|
||||||
|
"fan1_input": 9200.0,
|
||||||
|
}
|
||||||
|
got, ok := firstFanInputValue(feature)
|
||||||
|
if !ok || got != 9200 {
|
||||||
|
t.Fatalf("got=%v ok=%v", got, ok)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
Instantaneous power reading: 512 Watts
|
||||||
|
Minimum during sampling period: 498 Watts
|
||||||
|
`
|
||||||
|
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||||
|
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||||
|
now := time.Now()
|
||||||
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
|
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||||
|
if got != 480 {
|
||||||
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 480 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||||
|
if got != 530 {
|
||||||
|
t.Fatalf("got=%v want 530", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 530 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
|
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||||
|
if got != 0 {
|
||||||
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -30,21 +31,59 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
if len(jobs) != 5 {
|
if len(jobs) != 5 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
t.Parallel()
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
|
||||||
|
|
||||||
|
cfg := amdStressRVSConfig(123)
|
||||||
|
if !strings.Contains(cfg, "module: gst") {
|
||||||
|
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Contains(cfg, "module: mem") {
|
||||||
|
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||||
|
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||||
|
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||||
|
}
|
||||||
|
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||||
|
if !strings.Contains(cfg, field) {
|
||||||
|
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||||
|
if len(jobs) != 4 {
|
||||||
|
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||||
|
}
|
||||||
|
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||||
|
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||||
|
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||||
|
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[4].cmd
|
||||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
}
|
}
|
||||||
@@ -55,6 +94,173 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 600,
|
||||||
|
Loader: NvidiaStressLoaderJohn,
|
||||||
|
ExcludeGPUIndices: []int{1},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 120,
|
||||||
|
Loader: NvidiaStressLoaderNCCL,
|
||||||
|
GPUIndices: []int{2, 0},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices(nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "0,1,2"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "1,3"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
switch file {
|
||||||
|
case "dcgmproftester13":
|
||||||
|
return "/usr/bin/dcgmproftester13", nil
|
||||||
|
default:
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 4 {
|
||||||
|
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
|
if len(env) != 1 {
|
||||||
|
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
|
||||||
|
}
|
||||||
|
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||||
|
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||||
|
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||||
|
{loader: "", want: "gpu-nvidia-burn"},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||||
|
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestEnvIntFallback(t *testing.T) {
|
func TestEnvIntFallback(t *testing.T) {
|
||||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||||
@@ -80,8 +286,8 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -130,6 +336,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
if file == "stress-ng" {
|
||||||
|
return "/usr/bin/stress-ng", nil
|
||||||
|
}
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveSATCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 3 {
|
||||||
|
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/stress-ng" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||||
|
t.Fatalf("error=%q", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||||
|
|||||||
@@ -10,13 +10,30 @@ import (
|
|||||||
func (s *System) ListBeeServices() ([]string, error) {
|
func (s *System) ListBeeServices() ([]string, error) {
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
var out []string
|
var out []string
|
||||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
for _, pattern := range []string{
|
||||||
|
"/etc/systemd/system/bee-*.service",
|
||||||
|
"/lib/systemd/system/bee-*.service",
|
||||||
|
"/etc/systemd/system/bee-*.timer",
|
||||||
|
"/lib/systemd/system/bee-*.timer",
|
||||||
|
} {
|
||||||
matches, err := filepath.Glob(pattern)
|
matches, err := filepath.Glob(pattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
base := filepath.Base(match)
|
||||||
|
name := base
|
||||||
|
if strings.HasSuffix(base, ".service") {
|
||||||
|
name = strings.TrimSuffix(base, ".service")
|
||||||
|
}
|
||||||
|
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||||
|
if strings.HasSuffix(name, "@") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||||
|
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !seen[name] {
|
if !seen[name] {
|
||||||
seen[name] = true
|
seen[name] = true
|
||||||
out = append(out, name)
|
out = append(out, name)
|
||||||
|
|||||||
@@ -2,12 +2,31 @@ package platform
|
|||||||
|
|
||||||
// System is a stateless (field-less) value; instances are created with New.
type System struct{}
|
||||||
|
|
||||||
|
// LiveBootSource is serialized to JSON with the keys "in_ram", "kind",
// "source" and "device"; the last two are omitted when empty.
// NOTE(review): presumably this describes where the live system was booted
// from; confirm field semantics against the code that fills it in.
type LiveBootSource struct {
	InRAM  bool   `json:"in_ram"`
	Kind   string `json:"kind"`
	Source string `json:"source,omitempty"`
	Device string `json:"device,omitempty"`
}
|
||||||
|
|
||||||
// InterfaceInfo holds a network interface's name, a state string and its IPv4
// addresses.
type InterfaceInfo struct {
	Name  string
	State string
	IPv4  []string
}
|
||||||
|
|
||||||
|
// NetworkInterfaceSnapshot records one interface's name, whether it was up,
// and its IPv4 addresses.
type NetworkInterfaceSnapshot struct {
	Name string
	Up   bool
	IPv4 []string
}
|
||||||
|
|
||||||
|
type NetworkSnapshot struct {
|
||||||
|
Interfaces []NetworkInterfaceSnapshot
|
||||||
|
DefaultRoutes []string
|
||||||
|
ResolvConf string
|
||||||
|
}
|
||||||
|
|
||||||
type ServiceAction string
|
type ServiceAction string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@@ -25,12 +44,12 @@ type StaticIPv4Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// RemovableTarget describes a block device entry. It is serialized to JSON
// with snake_case keys ("device", "fs_type", "size", "label", "model",
// "mountpoint") rather than the Go field names.
type RemovableTarget struct {
	Device     string `json:"device"`
	FSType     string `json:"fs_type"`
	Size       string `json:"size"`
	Label      string `json:"label"`
	Model      string `json:"model"`
	Mountpoint string `json:"mountpoint"`
}
|
||||||
|
|
||||||
type ToolStatus struct {
|
type ToolStatus struct {
|
||||||
@@ -39,6 +58,20 @@ type ToolStatus struct {
|
|||||||
OK bool
|
OK bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loader names accepted in NvidiaStressOptions.Loader.
const (
	NvidiaStressLoaderBuiltin = "builtin"
	NvidiaStressLoaderJohn    = "john"
	NvidiaStressLoaderNCCL    = "nccl"
)
|
||||||
|
|
||||||
|
// NvidiaStressOptions configures an NVIDIA GPU stress job.
type NvidiaStressOptions struct {
	DurationSec       int    // run time in seconds
	SizeMB            int    // NOTE(review): presumably the per-GPU buffer size for the builtin loader; confirm
	Loader            string // one of the NvidiaStressLoader* names
	GPUIndices        []int  // GPUs to include
	ExcludeGPUIndices []int  // GPUs to leave out
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
return &System{}
|
return &System{}
|
||||||
}
|
}
|
||||||
|
|||||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
data, err := json.Marshal(RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
FSType: "exfat",
|
||||||
|
Size: "1.8T",
|
||||||
|
Label: "USB",
|
||||||
|
Model: "Flash",
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
raw := string(data)
|
||||||
|
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||||
|
if !strings.Contains(raw, key) {
|
||||||
|
t.Fatalf("json missing key %s: %s", key, raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||||
|
t.Fatalf("json still contains Go field names: %s", raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,217 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch msg.String() {
|
|
||||||
case "esc":
|
|
||||||
m.screen = screenNetwork
|
|
||||||
m.formFields = nil
|
|
||||||
m.formIndex = 0
|
|
||||||
return m, nil
|
|
||||||
case "up", "shift+tab":
|
|
||||||
if m.formIndex > 0 {
|
|
||||||
m.formIndex--
|
|
||||||
}
|
|
||||||
case "down", "tab":
|
|
||||||
if m.formIndex < len(m.formFields)-1 {
|
|
||||||
m.formIndex++
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
if m.formIndex < len(m.formFields)-1 {
|
|
||||||
m.formIndex++
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
cfg := m.app.ParseStaticIPv4Config(m.selectedIface, []string{
|
|
||||||
m.formFields[0].Value,
|
|
||||||
m.formFields[1].Value,
|
|
||||||
m.formFields[2].Value,
|
|
||||||
m.formFields[3].Value,
|
|
||||||
})
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Static IPv4: " + m.selectedIface
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.SetStaticIPv4Result(cfg)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
|
||||||
}
|
|
||||||
case "backspace":
|
|
||||||
field := &m.formFields[m.formIndex]
|
|
||||||
if len(field.Value) > 0 {
|
|
||||||
field.Value = field.Value[:len(field.Value)-1]
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
if msg.Type == tea.KeyRunes && len(msg.Runes) > 0 {
|
|
||||||
m.formFields[m.formIndex].Value += string(msg.Runes)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch msg.String() {
|
|
||||||
case "left", "up", "tab":
|
|
||||||
if m.cursor > 0 {
|
|
||||||
m.cursor--
|
|
||||||
}
|
|
||||||
case "right", "down":
|
|
||||||
if m.cursor < 1 {
|
|
||||||
m.cursor++
|
|
||||||
}
|
|
||||||
case "esc":
|
|
||||||
m.screen = m.confirmCancelTarget()
|
|
||||||
m.cursor = 0
|
|
||||||
m.pendingAction = actionNone
|
|
||||||
return m, nil
|
|
||||||
case "enter":
|
|
||||||
if m.cursor == 1 { // Cancel
|
|
||||||
m.screen = m.confirmCancelTarget()
|
|
||||||
m.cursor = 0
|
|
||||||
m.pendingAction = actionNone
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
m.busy = true
|
|
||||||
switch m.pendingAction {
|
|
||||||
case actionExportBundle:
|
|
||||||
m.busyTitle = "Export support bundle"
|
|
||||||
target := *m.selectedTarget
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.ExportSupportBundleResult(target)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
|
||||||
}
|
|
||||||
case actionRunAll:
|
|
||||||
return m.executeRunAll()
|
|
||||||
case actionRunMemorySAT:
|
|
||||||
m.busyTitle = "Memory test"
|
|
||||||
m.progressPrefix = "memory"
|
|
||||||
m.progressSince = time.Now()
|
|
||||||
m.progressLines = nil
|
|
||||||
since := m.progressSince
|
|
||||||
return m, tea.Batch(
|
|
||||||
func() tea.Msg {
|
|
||||||
result, err := m.app.RunMemoryAcceptancePackResult("")
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
|
||||||
},
|
|
||||||
pollSATProgress("memory", since),
|
|
||||||
)
|
|
||||||
case actionRunStorageSAT:
|
|
||||||
m.busyTitle = "Storage test"
|
|
||||||
m.progressPrefix = "storage"
|
|
||||||
m.progressSince = time.Now()
|
|
||||||
m.progressLines = nil
|
|
||||||
since := m.progressSince
|
|
||||||
return m, tea.Batch(
|
|
||||||
func() tea.Msg {
|
|
||||||
result, err := m.app.RunStorageAcceptancePackResult("")
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
|
||||||
},
|
|
||||||
pollSATProgress("storage", since),
|
|
||||||
)
|
|
||||||
case actionRunCPUSAT:
|
|
||||||
m.busyTitle = "CPU test"
|
|
||||||
m.progressPrefix = "cpu"
|
|
||||||
m.progressSince = time.Now()
|
|
||||||
m.progressLines = nil
|
|
||||||
since := m.progressSince
|
|
||||||
durationSec := hcCPUDurations[m.hcMode]
|
|
||||||
return m, tea.Batch(
|
|
||||||
func() tea.Msg {
|
|
||||||
result, err := m.app.RunCPUAcceptancePackResult("", durationSec)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
|
||||||
},
|
|
||||||
pollSATProgress("cpu", since),
|
|
||||||
)
|
|
||||||
case actionRunAMDGPUSAT:
|
|
||||||
m.busyTitle = "AMD GPU test"
|
|
||||||
m.progressPrefix = "gpu-amd"
|
|
||||||
m.progressSince = time.Now()
|
|
||||||
m.progressLines = nil
|
|
||||||
since := m.progressSince
|
|
||||||
return m, tea.Batch(
|
|
||||||
func() tea.Msg {
|
|
||||||
result, err := m.app.RunAMDAcceptancePackResult("")
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
|
||||||
},
|
|
||||||
pollSATProgress("gpu-amd", since),
|
|
||||||
)
|
|
||||||
case actionRunFanStress:
|
|
||||||
m.busyTitle = "Fan Stress Test"
|
|
||||||
m.progressPrefix = "fan-stress"
|
|
||||||
m.progressSince = time.Now()
|
|
||||||
m.progressLines = nil
|
|
||||||
since := m.progressSince
|
|
||||||
opts := hcFanStressOpts(m.hcMode, m.app)
|
|
||||||
return m, tea.Batch(
|
|
||||||
func() tea.Msg {
|
|
||||||
ctx := context.Background()
|
|
||||||
result, err := m.app.RunFanStressTestResult(ctx, opts)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
|
||||||
},
|
|
||||||
pollSATProgress("fan-stress", since),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
case "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) confirmCancelTarget() screen {
|
|
||||||
switch m.pendingAction {
|
|
||||||
case actionExportBundle:
|
|
||||||
return screenExportTargets
|
|
||||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT, actionRunFanStress:
|
|
||||||
return screenHealthCheck
|
|
||||||
default:
|
|
||||||
return screenMain
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// hcFanStressOpts builds FanStressOptions for the selected mode, auto-detecting all GPUs.
|
|
||||||
func hcFanStressOpts(hcMode int, application interface {
|
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
|
||||||
}) platform.FanStressOptions {
|
|
||||||
// Phase durations per mode: [baseline, load1, pause, load2]
|
|
||||||
type durations struct{ baseline, load1, pause, load2 int }
|
|
||||||
modes := [3]durations{
|
|
||||||
{30, 120, 30, 120}, // Quick: ~5 min total
|
|
||||||
{60, 300, 60, 300}, // Standard: ~12 min total
|
|
||||||
{60, 600, 120, 600}, // Express: ~24 min total
|
|
||||||
}
|
|
||||||
if hcMode < 0 || hcMode >= len(modes) {
|
|
||||||
hcMode = 0
|
|
||||||
}
|
|
||||||
d := modes[hcMode]
|
|
||||||
|
|
||||||
// Use all detected NVIDIA GPUs.
|
|
||||||
var indices []int
|
|
||||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
|
||||||
for _, g := range gpus {
|
|
||||||
indices = append(indices, g.Index)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use minimum GPU memory size to fit all GPUs.
|
|
||||||
sizeMB := 64
|
|
||||||
if gpus, err := application.ListNvidiaGPUs(); err == nil {
|
|
||||||
for _, g := range gpus {
|
|
||||||
if g.MemoryMB > 0 && (sizeMB == 64 || g.MemoryMB < sizeMB) {
|
|
||||||
sizeMB = g.MemoryMB / 16 // allocate 1/16 of VRAM per GPU
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return platform.FanStressOptions{
|
|
||||||
BaselineSec: d.baseline,
|
|
||||||
Phase1DurSec: d.load1,
|
|
||||||
PauseSec: d.pause,
|
|
||||||
Phase2DurSec: d.load2,
|
|
||||||
SizeMB: sizeMB,
|
|
||||||
GPUIndices: indices,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bee/audit/internal/app"
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
)
|
|
||||||
|
|
||||||
type resultMsg struct {
|
|
||||||
title string
|
|
||||||
body string
|
|
||||||
err error
|
|
||||||
back screen
|
|
||||||
}
|
|
||||||
|
|
||||||
// servicesMsg carries a list of service names, or the error from listing them.
type servicesMsg struct {
	services []string // service names
	err      error    // listing error, if any
}
|
|
||||||
|
|
||||||
type interfacesMsg struct {
|
|
||||||
ifaces []platform.InterfaceInfo
|
|
||||||
err error
|
|
||||||
}
|
|
||||||
|
|
||||||
type exportTargetsMsg struct {
|
|
||||||
targets []platform.RemovableTarget
|
|
||||||
err error
|
|
||||||
}
|
|
||||||
|
|
||||||
type snapshotMsg struct {
|
|
||||||
banner string
|
|
||||||
panel app.HardwarePanelData
|
|
||||||
}
|
|
||||||
|
|
||||||
type nvidiaGPUsMsg struct {
|
|
||||||
gpus []platform.NvidiaGPU
|
|
||||||
err error
|
|
||||||
}
|
|
||||||
|
|
||||||
// nvtopClosedMsg is an empty marker message; it carries no data.
type nvtopClosedMsg struct{}
|
|
||||||
|
|
||||||
// nvidiaSATDoneMsg reports the outcome of an NVIDIA SAT run: a title and body
// to display and the error, if any.
type nvidiaSATDoneMsg struct {
	title, body string
	err         error
}
|
|
||||||
@@ -1,131 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"sort"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
// satProgressMsg carries the progress display lines produced by one poll of a
// SAT verbose log.
type satProgressMsg struct {
	lines []string
}
|
|
||||||
|
|
||||||
// pollSATProgress returns a Cmd that waits 300ms then reads the latest verbose.log
|
|
||||||
// for the given SAT prefix and returns parsed step progress lines.
|
|
||||||
func pollSATProgress(prefix string, since time.Time) tea.Cmd {
|
|
||||||
return tea.Tick(300*time.Millisecond, func(_ time.Time) tea.Msg {
|
|
||||||
return satProgressMsg{lines: readSATProgressLines(prefix, since)}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func readSATProgressLines(prefix string, since time.Time) []string {
|
|
||||||
pattern := filepath.Join(app.DefaultSATBaseDir, prefix+"-*/verbose.log")
|
|
||||||
matches, err := filepath.Glob(pattern)
|
|
||||||
if err != nil || len(matches) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
sort.Strings(matches)
|
|
||||||
// Find the latest file created at or after (since - 5s) to account for clock skew.
|
|
||||||
cutoff := since.Add(-5 * time.Second)
|
|
||||||
candidate := ""
|
|
||||||
for _, m := range matches {
|
|
||||||
info, statErr := os.Stat(m)
|
|
||||||
if statErr == nil && info.ModTime().After(cutoff) {
|
|
||||||
candidate = m
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if candidate == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
raw, err := os.ReadFile(candidate)
|
|
||||||
if err != nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return parseSATVerboseProgress(string(raw))
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseSATVerboseProgress parses verbose.log content and returns display lines like:
|
|
||||||
//
|
|
||||||
// "PASS lscpu (234ms)"
|
|
||||||
// "FAIL stress-ng (60.0s)"
|
|
||||||
// "... sensors-after"
|
|
||||||
func parseSATVerboseProgress(content string) []string {
|
|
||||||
type step struct {
|
|
||||||
name string
|
|
||||||
rc int
|
|
||||||
durationMs int
|
|
||||||
done bool
|
|
||||||
}
|
|
||||||
|
|
||||||
lines := strings.Split(content, "\n")
|
|
||||||
var steps []step
|
|
||||||
stepIdx := map[string]int{}
|
|
||||||
|
|
||||||
for i, line := range lines {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
if idx := strings.Index(line, "] start "); idx >= 0 {
|
|
||||||
name := strings.TrimSpace(line[idx+len("] start "):])
|
|
||||||
if _, exists := stepIdx[name]; !exists {
|
|
||||||
stepIdx[name] = len(steps)
|
|
||||||
steps = append(steps, step{name: name})
|
|
||||||
}
|
|
||||||
} else if idx := strings.Index(line, "] finish "); idx >= 0 {
|
|
||||||
name := strings.TrimSpace(line[idx+len("] finish "):])
|
|
||||||
si, exists := stepIdx[name]
|
|
||||||
if !exists {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
steps[si].done = true
|
|
||||||
for j := i + 1; j < len(lines) && j <= i+3; j++ {
|
|
||||||
l := strings.TrimSpace(lines[j])
|
|
||||||
if strings.HasPrefix(l, "rc: ") {
|
|
||||||
steps[si].rc, _ = strconv.Atoi(strings.TrimPrefix(l, "rc: "))
|
|
||||||
} else if strings.HasPrefix(l, "duration_ms: ") {
|
|
||||||
steps[si].durationMs, _ = strconv.Atoi(strings.TrimPrefix(l, "duration_ms: "))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var result []string
|
|
||||||
for _, s := range steps {
|
|
||||||
display := cleanSATStepName(s.name)
|
|
||||||
if s.done {
|
|
||||||
status := "PASS"
|
|
||||||
if s.rc != 0 {
|
|
||||||
status = "FAIL"
|
|
||||||
}
|
|
||||||
result = append(result, fmt.Sprintf("%-4s %s (%s)", status, display, fmtDurMs(s.durationMs)))
|
|
||||||
} else {
|
|
||||||
result = append(result, fmt.Sprintf("... %s", display))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
// cleanSATStepName derives a display name from a step log name: it drops a
// trailing ".log" and then, when the leading run of ASCII digits is followed by
// a dash, drops the digits and the dash ("01-lscpu.log" → "lscpu"). Without
// that dash the digits are kept.
func cleanSATStepName(name string) string {
	base := strings.TrimSuffix(name, ".log")
	afterDigits := strings.TrimLeft(base, "0123456789")
	if strings.HasPrefix(afterDigits, "-") {
		return afterDigits[1:]
	}
	return base
}
|
|
||||||
|
|
||||||
// fmtDurMs formats a millisecond count for display: values below 1000 are
// printed as whole milliseconds ("234ms"), larger values as seconds with one
// decimal place ("60.0s").
func fmtDurMs(ms int) string {
	if ms >= 1000 {
		seconds := float64(ms) / 1000
		return fmt.Sprintf("%.1fs", seconds)
	}
	return fmt.Sprintf("%dms", ms)
}
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import tea "github.com/charmbracelet/bubbletea"
|
|
||||||
|
|
||||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
|
||||||
if len(m.targets) == 0 {
|
|
||||||
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
|
|
||||||
}
|
|
||||||
target := m.targets[m.cursor]
|
|
||||||
m.selectedTarget = &target
|
|
||||||
m.pendingAction = actionExportBundle
|
|
||||||
m.screen = screenConfirm
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
@@ -1,327 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Component indices into the health check selection array.
const (
	hcGPU = iota
	hcMemory
	hcStorage
	hcCPU
)
|
|
||||||
|
|
||||||
// Cursor positions on the Health Check screen, top to bottom. hcCurTotal is
// the number of positions.
const (
	hcCurGPU = iota
	hcCurMemory
	hcCurStorage
	hcCurCPU
	hcCurSelectAll
	hcCurModeQuick
	hcCurModeStd
	hcCurModeExpr
	hcCurRunAll
	hcCurFanStress
	hcCurTotal
)
|
|
||||||
|
|
||||||
// hcModeDurations gives the GPU stress duration in seconds for each mode index
// (0=Quick, 1=Standard, 2=Express): 10 minutes, 1 hour, 8 hours.
var hcModeDurations = [3]int{
	10 * 60,
	60 * 60,
	8 * 60 * 60,
}
|
|
||||||
|
|
||||||
// hcCPUDurations gives the CPU stress-ng duration in seconds for each mode
// index: 1, 5, and 15 minutes.
var hcCPUDurations = [3]int{
	1 * 60,
	5 * 60,
	15 * 60,
}
|
|
||||||
|
|
||||||
func (m model) enterHealthCheck() (tea.Model, tea.Cmd) {
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
if !m.hcInitialized {
|
|
||||||
m.hcSel = [4]bool{true, true, true, true}
|
|
||||||
m.hcMode = 0
|
|
||||||
m.hcCursor = 0
|
|
||||||
m.hcInitialized = true
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch msg.String() {
|
|
||||||
case "up", "k":
|
|
||||||
if m.hcCursor > 0 {
|
|
||||||
m.hcCursor--
|
|
||||||
}
|
|
||||||
case "down", "j":
|
|
||||||
if m.hcCursor < hcCurTotal-1 {
|
|
||||||
m.hcCursor++
|
|
||||||
}
|
|
||||||
case " ":
|
|
||||||
switch m.hcCursor {
|
|
||||||
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
|
||||||
m.hcSel[m.hcCursor] = !m.hcSel[m.hcCursor]
|
|
||||||
case hcCurSelectAll:
|
|
||||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
||||||
for i := range m.hcSel {
|
|
||||||
m.hcSel[i] = !allOn
|
|
||||||
}
|
|
||||||
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
|
||||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
switch m.hcCursor {
|
|
||||||
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
|
||||||
return m.hcRunSingle(m.hcCursor)
|
|
||||||
case hcCurSelectAll:
|
|
||||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
||||||
for i := range m.hcSel {
|
|
||||||
m.hcSel[i] = !allOn
|
|
||||||
}
|
|
||||||
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
|
||||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
|
||||||
case hcCurRunAll:
|
|
||||||
return m.hcRunAll()
|
|
||||||
case hcCurFanStress:
|
|
||||||
return m.hcRunFanStress()
|
|
||||||
}
|
|
||||||
case "g", "G":
|
|
||||||
return m.hcRunSingle(hcGPU)
|
|
||||||
case "m", "M":
|
|
||||||
return m.hcRunSingle(hcMemory)
|
|
||||||
case "s", "S":
|
|
||||||
return m.hcRunSingle(hcStorage)
|
|
||||||
case "c", "C":
|
|
||||||
return m.hcRunSingle(hcCPU)
|
|
||||||
case "r", "R":
|
|
||||||
return m.hcRunAll()
|
|
||||||
case "f", "F":
|
|
||||||
return m.hcRunFanStress()
|
|
||||||
case "a", "A":
|
|
||||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
||||||
for i := range m.hcSel {
|
|
||||||
m.hcSel[i] = !allOn
|
|
||||||
}
|
|
||||||
case "1":
|
|
||||||
m.hcMode = 0
|
|
||||||
case "2":
|
|
||||||
m.hcMode = 1
|
|
||||||
case "3":
|
|
||||||
m.hcMode = 2
|
|
||||||
case "esc":
|
|
||||||
m.screen = screenMain
|
|
||||||
m.cursor = 0
|
|
||||||
case "q", "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
|
||||||
switch idx {
|
|
||||||
case hcGPU:
|
|
||||||
if m.app.DetectGPUVendor() == "amd" {
|
|
||||||
m.pendingAction = actionRunAMDGPUSAT
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
m.nvidiaDurIdx = m.hcMode
|
|
||||||
return m.enterNvidiaSATSetup()
|
|
||||||
case hcMemory:
|
|
||||||
m.pendingAction = actionRunMemorySAT
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
case hcStorage:
|
|
||||||
m.pendingAction = actionRunStorageSAT
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
case hcCPU:
|
|
||||||
m.pendingAction = actionRunCPUSAT
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) hcRunFanStress() (tea.Model, tea.Cmd) {
|
|
||||||
m.pendingAction = actionRunFanStress
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
|
||||||
for _, sel := range m.hcSel {
|
|
||||||
if sel {
|
|
||||||
m.pendingAction = actionRunAll
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
|
||||||
durationSec := hcModeDurations[m.hcMode]
|
|
||||||
durationIdx := m.hcMode
|
|
||||||
sel := m.hcSel
|
|
||||||
app := m.app
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Health Check"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
var parts []string
|
|
||||||
if sel[hcGPU] {
|
|
||||||
vendor := app.DetectGPUVendor()
|
|
||||||
if vendor == "amd" {
|
|
||||||
r, err := app.RunAMDAcceptancePackResult("")
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
|
||||||
} else {
|
|
||||||
gpus, err := app.ListNvidiaGPUs()
|
|
||||||
if err != nil || len(gpus) == 0 {
|
|
||||||
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
|
|
||||||
} else {
|
|
||||||
var indices []int
|
|
||||||
sizeMB := 0
|
|
||||||
for _, g := range gpus {
|
|
||||||
indices = append(indices, g.Index)
|
|
||||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
|
||||||
sizeMB = g.MemoryMB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if sizeMB == 0 {
|
|
||||||
sizeMB = 64
|
|
||||||
}
|
|
||||||
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== GPU ===\n"+body)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if sel[hcMemory] {
|
|
||||||
r, err := app.RunMemoryAcceptancePackResult("")
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== MEMORY ===\n"+body)
|
|
||||||
}
|
|
||||||
if sel[hcStorage] {
|
|
||||||
r, err := app.RunStorageAcceptancePackResult("")
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== STORAGE ===\n"+body)
|
|
||||||
}
|
|
||||||
if sel[hcCPU] {
|
|
||||||
cpuDur := hcCPUDurations[durationIdx]
|
|
||||||
r, err := app.RunCPUAcceptancePackResult("", cpuDur)
|
|
||||||
body := r.Body
|
|
||||||
if err != nil {
|
|
||||||
body += "\nERROR: " + err.Error()
|
|
||||||
}
|
|
||||||
parts = append(parts, "=== CPU ===\n"+body)
|
|
||||||
}
|
|
||||||
combined := strings.Join(parts, "\n\n")
|
|
||||||
if combined == "" {
|
|
||||||
combined = "No components selected."
|
|
||||||
}
|
|
||||||
return resultMsg{title: "Health Check", body: combined, back: screenHealthCheck}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func renderHealthCheck(m model) string {
|
|
||||||
var b strings.Builder
|
|
||||||
|
|
||||||
fmt.Fprintln(&b, "HEALTH CHECK")
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
fmt.Fprintln(&b, " Diagnostics:")
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
|
|
||||||
type comp struct{ name, desc, key string }
|
|
||||||
comps := []comp{
|
|
||||||
{"GPU", "nvidia/amd auto-detect", "G"},
|
|
||||||
{"MEMORY", "memtester", "M"},
|
|
||||||
{"STORAGE", "smartctl + NVMe self-test", "S"},
|
|
||||||
{"CPU", "audit diagnostics", "C"},
|
|
||||||
}
|
|
||||||
for i, c := range comps {
|
|
||||||
pfx := " "
|
|
||||||
if m.hcCursor == i {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
ch := "[ ]"
|
|
||||||
if m.hcSel[i] {
|
|
||||||
ch = "[x]"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s %-8s %-28s [%s]\n", pfx, ch, c.name, c.desc, c.key)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprintln(&b, " ─────────────────────────────────────────────────")
|
|
||||||
{
|
|
||||||
pfx := " "
|
|
||||||
if m.hcCursor == hcCurSelectAll {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
|
||||||
ch := "[ ]"
|
|
||||||
if allOn {
|
|
||||||
ch = "[x]"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s Select / Deselect All [A]\n", pfx, ch)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
fmt.Fprintln(&b, " Mode:")
|
|
||||||
modes := []struct{ label, key string }{
|
|
||||||
{"Quick", "1"},
|
|
||||||
{"Standard", "2"},
|
|
||||||
{"Express", "3"},
|
|
||||||
}
|
|
||||||
for i, mode := range modes {
|
|
||||||
pfx := " "
|
|
||||||
if m.hcCursor == hcCurModeQuick+i {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
radio := "( )"
|
|
||||||
if m.hcMode == i {
|
|
||||||
radio = "(*)"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
{
|
|
||||||
pfx := " "
|
|
||||||
if m.hcCursor == hcCurRunAll {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
pfx := " "
|
|
||||||
if m.hcCursor == hcCurFanStress {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s[ FAN STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
|
||||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [F] fan stress [Esc] back")
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
|
||||||
switch m.cursor {
|
|
||||||
case 0: // Health Check
|
|
||||||
return m.enterHealthCheck()
|
|
||||||
case 1: // Export support bundle
|
|
||||||
m.pendingAction = actionExportBundle
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Export support bundle"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
targets, err := m.app.ListRemovableTargets()
|
|
||||||
return exportTargetsMsg{targets: targets, err: err}
|
|
||||||
}
|
|
||||||
case 2: // Settings
|
|
||||||
m.screen = screenSettings
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
case 3: // Exit
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
@@ -1,76 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
|
|
||||||
switch m.cursor {
|
|
||||||
case 0:
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Network status"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.NetworkStatus()
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
|
||||||
}
|
|
||||||
case 1:
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "DHCP all interfaces"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.DHCPAllResult()
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
|
||||||
}
|
|
||||||
case 2:
|
|
||||||
m.pendingAction = actionDHCPOne
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Interfaces"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
ifaces, err := m.app.ListInterfaces()
|
|
||||||
return interfacesMsg{ifaces: ifaces, err: err}
|
|
||||||
}
|
|
||||||
case 3:
|
|
||||||
m.pendingAction = actionStaticIPv4
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Interfaces"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
ifaces, err := m.app.ListInterfaces()
|
|
||||||
return interfacesMsg{ifaces: ifaces, err: err}
|
|
||||||
}
|
|
||||||
case 4:
|
|
||||||
m.screen = screenSettings
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
|
|
||||||
if len(m.interfaces) == 0 {
|
|
||||||
return m, resultCmd("interfaces", "No physical interfaces found", nil, screenNetwork)
|
|
||||||
}
|
|
||||||
m.selectedIface = m.interfaces[m.cursor].Name
|
|
||||||
switch m.pendingAction {
|
|
||||||
case actionDHCPOne:
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "DHCP on " + m.selectedIface
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.DHCPOneResult(m.selectedIface)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
|
||||||
}
|
|
||||||
case actionStaticIPv4:
|
|
||||||
defaults := m.app.DefaultStaticIPv4FormFields(m.selectedIface)
|
|
||||||
m.formFields = []formField{
|
|
||||||
{Label: "IPv4 address", Value: defaults[0]},
|
|
||||||
{Label: "Prefix", Value: defaults[1]},
|
|
||||||
{Label: "Gateway", Value: strings.TrimSpace(defaults[2])},
|
|
||||||
{Label: "DNS (space-separated)", Value: defaults[3]},
|
|
||||||
}
|
|
||||||
m.formIndex = 0
|
|
||||||
m.screen = screenStaticForm
|
|
||||||
return m, nil
|
|
||||||
default:
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,238 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"os/exec"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
// nvidiaDurationOptions lists the selectable NVIDIA SAT durations in display
// order, each with its label and length in seconds.
var nvidiaDurationOptions = []struct {
	label   string
	seconds int
}{
	{label: "10 minutes", seconds: 600},
	{label: "1 hour", seconds: 3600},
	{label: "8 hours", seconds: 28800},
	{label: "24 hours", seconds: 86400},
}
|
|
||||||
|
|
||||||
// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
|
|
||||||
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
|
|
||||||
m.screen = screenNvidiaSATSetup
|
|
||||||
m.nvidiaGPUs = nil
|
|
||||||
m.nvidiaGPUSel = nil
|
|
||||||
m.nvidiaDurIdx = 0
|
|
||||||
m.nvidiaSATCursor = 0
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "NVIDIA SAT"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
gpus, err := m.app.ListNvidiaGPUs()
|
|
||||||
return nvidiaGPUsMsg{gpus: gpus, err: err}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// handleNvidiaGPUsMsg processes the GPU list response.
|
|
||||||
func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
|
|
||||||
m.busy = false
|
|
||||||
m.busyTitle = ""
|
|
||||||
if msg.err != nil {
|
|
||||||
m.title = "NVIDIA SAT"
|
|
||||||
m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
|
|
||||||
m.prevScreen = screenHealthCheck
|
|
||||||
m.screen = screenOutput
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
m.nvidiaGPUs = msg.gpus
|
|
||||||
m.nvidiaGPUSel = make([]bool, len(msg.gpus))
|
|
||||||
for i := range m.nvidiaGPUSel {
|
|
||||||
m.nvidiaGPUSel[i] = true // all selected by default
|
|
||||||
}
|
|
||||||
m.nvidiaSATCursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateNvidiaSATSetup handles keys on the setup screen.
|
|
||||||
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
numDur := len(nvidiaDurationOptions)
|
|
||||||
numGPU := len(m.nvidiaGPUs)
|
|
||||||
totalItems := numDur + numGPU + 2 // +2: Start, Cancel
|
|
||||||
switch msg.String() {
|
|
||||||
case "up", "k":
|
|
||||||
if m.nvidiaSATCursor > 0 {
|
|
||||||
m.nvidiaSATCursor--
|
|
||||||
}
|
|
||||||
case "down", "j":
|
|
||||||
if m.nvidiaSATCursor < totalItems-1 {
|
|
||||||
m.nvidiaSATCursor++
|
|
||||||
}
|
|
||||||
case " ":
|
|
||||||
switch {
|
|
||||||
case m.nvidiaSATCursor < numDur:
|
|
||||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
|
||||||
case m.nvidiaSATCursor < numDur+numGPU:
|
|
||||||
i := m.nvidiaSATCursor - numDur
|
|
||||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
startIdx := numDur + numGPU
|
|
||||||
cancelIdx := startIdx + 1
|
|
||||||
switch {
|
|
||||||
case m.nvidiaSATCursor < numDur:
|
|
||||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
|
||||||
case m.nvidiaSATCursor < startIdx:
|
|
||||||
i := m.nvidiaSATCursor - numDur
|
|
||||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
|
||||||
case m.nvidiaSATCursor == startIdx:
|
|
||||||
return m.startNvidiaSAT()
|
|
||||||
case m.nvidiaSATCursor == cancelIdx:
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.cursor = 0
|
|
||||||
}
|
|
||||||
case "esc":
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.cursor = 0
|
|
||||||
case "ctrl+c", "q":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// startNvidiaSAT launches the SAT and nvtop.
|
|
||||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
|
||||||
var selectedGPUs []platform.NvidiaGPU
|
|
||||||
for i, sel := range m.nvidiaGPUSel {
|
|
||||||
if sel {
|
|
||||||
selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(selectedGPUs) == 0 {
|
|
||||||
selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
|
|
||||||
}
|
|
||||||
|
|
||||||
sizeMB := 0
|
|
||||||
for _, g := range selectedGPUs {
|
|
||||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
|
||||||
sizeMB = g.MemoryMB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if sizeMB == 0 {
|
|
||||||
sizeMB = 64
|
|
||||||
}
|
|
||||||
|
|
||||||
var gpuIndices []int
|
|
||||||
for _, g := range selectedGPUs {
|
|
||||||
gpuIndices = append(gpuIndices, g.Index)
|
|
||||||
}
|
|
||||||
|
|
||||||
durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
m.nvidiaSATCancel = cancel
|
|
||||||
m.nvidiaSATAborted = false
|
|
||||||
m.screen = screenNvidiaSATRunning
|
|
||||||
m.nvidiaSATCursor = 0
|
|
||||||
|
|
||||||
satCmd := func() tea.Msg {
|
|
||||||
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
|
|
||||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
|
||||||
}
|
|
||||||
|
|
||||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
|
||||||
if lookErr != nil {
|
|
||||||
// nvtop not available: just run the SAT, show running screen
|
|
||||||
return m, satCmd
|
|
||||||
}
|
|
||||||
|
|
||||||
return m, tea.Batch(
|
|
||||||
satCmd,
|
|
||||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
|
||||||
return nvtopClosedMsg{}
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateNvidiaSATRunning handles keys on the running screen.
|
|
||||||
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch msg.String() {
|
|
||||||
case "o", "O":
|
|
||||||
nvtopPath, err := exec.LookPath("nvtop")
|
|
||||||
if err != nil {
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
|
||||||
return nvtopClosedMsg{}
|
|
||||||
})
|
|
||||||
case "a", "A":
|
|
||||||
if m.nvidiaSATCancel != nil {
|
|
||||||
m.nvidiaSATCancel()
|
|
||||||
m.nvidiaSATCancel = nil
|
|
||||||
}
|
|
||||||
m.nvidiaSATAborted = true
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.cursor = 0
|
|
||||||
case "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderNvidiaSATSetup renders the setup screen.
|
|
||||||
func renderNvidiaSATSetup(m model) string {
|
|
||||||
var b strings.Builder
|
|
||||||
fmt.Fprintln(&b, "NVIDIA SAT")
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
fmt.Fprintln(&b, "Duration:")
|
|
||||||
for i, opt := range nvidiaDurationOptions {
|
|
||||||
radio := "( )"
|
|
||||||
if i == m.nvidiaDurIdx {
|
|
||||||
radio = "(*)"
|
|
||||||
}
|
|
||||||
prefix := " "
|
|
||||||
if m.nvidiaSATCursor == i {
|
|
||||||
prefix = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
|
|
||||||
}
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
if len(m.nvidiaGPUs) == 0 {
|
|
||||||
fmt.Fprintln(&b, "GPUs: (none detected)")
|
|
||||||
} else {
|
|
||||||
fmt.Fprintln(&b, "GPUs:")
|
|
||||||
for i, gpu := range m.nvidiaGPUs {
|
|
||||||
check := "[ ]"
|
|
||||||
if m.nvidiaGPUSel[i] {
|
|
||||||
check = "[x]"
|
|
||||||
}
|
|
||||||
prefix := " "
|
|
||||||
if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
|
|
||||||
prefix = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
|
|
||||||
startPfx := " "
|
|
||||||
cancelPfx := " "
|
|
||||||
if m.nvidiaSATCursor == startIdx {
|
|
||||||
startPfx = "> "
|
|
||||||
}
|
|
||||||
if m.nvidiaSATCursor == startIdx+1 {
|
|
||||||
cancelPfx = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "%sStart\n", startPfx)
|
|
||||||
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
|
|
||||||
fmt.Fprintln(&b)
|
|
||||||
b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n")
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderNvidiaSATRunning returns the static text shown while the NVIDIA SAT
// is in progress: a heading, a status line and the available key bindings.
func renderNvidiaSATRunning() string {
	const (
		heading = "NVIDIA SAT\n\n"
		status  = "Test is running...\n\n"
		keys    = "[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
	)
	return heading + status + keys
}
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
|
|
||||||
if len(m.services) == 0 {
|
|
||||||
return m, resultCmd("Services", "No bee-* services found.", nil, screenSettings)
|
|
||||||
}
|
|
||||||
m.selectedService = m.services[m.cursor]
|
|
||||||
m.screen = screenServiceAction
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
|
|
||||||
action := m.serviceMenu[m.cursor]
|
|
||||||
if action == "back" {
|
|
||||||
m.screen = screenServices
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "service: " + m.selectedService
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
switch action {
|
|
||||||
case "Status":
|
|
||||||
result, err := m.app.ServiceStatusResult(m.selectedService)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
|
||||||
case "Restart":
|
|
||||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
|
||||||
case "Start":
|
|
||||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
|
||||||
case "Stop":
|
|
||||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
|
||||||
default:
|
|
||||||
return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import tea "github.com/charmbracelet/bubbletea"
|
|
||||||
|
|
||||||
func (m model) handleSettingsMenu() (tea.Model, tea.Cmd) {
|
|
||||||
switch m.cursor {
|
|
||||||
case 0: // Network
|
|
||||||
m.screen = screenNetwork
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
case 1: // Services
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Services"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
services, err := m.app.ListBeeServices()
|
|
||||||
return servicesMsg{services: services, err: err}
|
|
||||||
}
|
|
||||||
case 2: // Re-run audit
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Re-run audit"
|
|
||||||
runtimeMode := m.runtimeMode
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.RunAuditNow(runtimeMode)
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
|
|
||||||
}
|
|
||||||
case 3: // Run self-check
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Self-check"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result, err := m.app.RunRuntimePreflightResult()
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
|
|
||||||
}
|
|
||||||
case 4: // Runtime issues
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Runtime issues"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result := m.app.RuntimeHealthResult()
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
|
||||||
}
|
|
||||||
case 5: // Audit logs
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Audit logs"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result := m.app.AuditLogTailResult()
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
|
||||||
}
|
|
||||||
case 6: // Check tools
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Check tools"
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
result := m.app.ToolCheckResult([]string{
|
|
||||||
"dmidecode", "smartctl", "nvme", "ipmitool", "lspci",
|
|
||||||
"ethtool", "bee", "nvidia-smi", "bee-gpu-stress",
|
|
||||||
"memtester", "dhclient", "lsblk", "mount",
|
|
||||||
})
|
|
||||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
|
||||||
}
|
|
||||||
case 7: // Back
|
|
||||||
m.screen = screenMain
|
|
||||||
m.cursor = 0
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
@@ -1,30 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bee/audit/internal/app"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func (m model) refreshSnapshotCmd() tea.Cmd {
|
|
||||||
if m.app == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return func() tea.Msg {
|
|
||||||
return snapshotMsg{
|
|
||||||
banner: m.app.MainBanner(),
|
|
||||||
panel: m.app.LoadHardwarePanel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func shouldRefreshSnapshot(prev, next model) bool {
|
|
||||||
return prev.screen != next.screen || prev.busy != next.busy
|
|
||||||
}
|
|
||||||
|
|
||||||
func emptySnapshot() snapshotMsg {
|
|
||||||
return snapshotMsg{
|
|
||||||
banner: "",
|
|
||||||
panel: app.HardwarePanelData{},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,628 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
"bee/audit/internal/runtimeenv"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
func newTestModel() model {
|
|
||||||
return newModel(app.New(platform.New()), runtimeenv.ModeLocal)
|
|
||||||
}
|
|
||||||
|
|
||||||
func sendKey(t *testing.T, m model, key tea.KeyType) model {
|
|
||||||
t.Helper()
|
|
||||||
|
|
||||||
next, _ := m.Update(tea.KeyMsg{Type: key})
|
|
||||||
return next.(model)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateMainMenuCursorNavigation(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
|
|
||||||
m = sendKey(t, m, tea.KeyDown)
|
|
||||||
if m.cursor != 1 {
|
|
||||||
t.Fatalf("cursor=%d want 1 after down", m.cursor)
|
|
||||||
}
|
|
||||||
|
|
||||||
m = sendKey(t, m, tea.KeyDown)
|
|
||||||
if m.cursor != 2 {
|
|
||||||
t.Fatalf("cursor=%d want 2 after second down", m.cursor)
|
|
||||||
}
|
|
||||||
|
|
||||||
m = sendKey(t, m, tea.KeyUp)
|
|
||||||
if m.cursor != 1 {
|
|
||||||
t.Fatalf("cursor=%d want 1 after up", m.cursor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateMainMenuEnterActions(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
cursor int
|
|
||||||
wantScreen screen
|
|
||||||
wantBusy bool
|
|
||||||
wantCmd bool
|
|
||||||
}{
|
|
||||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck, wantCmd: true},
|
|
||||||
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
|
||||||
{name: "settings", cursor: 2, wantScreen: screenSettings, wantCmd: true},
|
|
||||||
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, test := range tests {
|
|
||||||
test := test
|
|
||||||
t.Run(test.name, func(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.cursor = test.cursor
|
|
||||||
|
|
||||||
next, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.screen != test.wantScreen {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
|
||||||
}
|
|
||||||
if got.busy != test.wantBusy {
|
|
||||||
t.Fatalf("busy=%v want %v", got.busy, test.wantBusy)
|
|
||||||
}
|
|
||||||
if (cmd != nil) != test.wantCmd {
|
|
||||||
t.Fatalf("cmd present=%v want %v", cmd != nil, test.wantCmd)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUpdateConfirmCancelViaKeys(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.pendingAction = actionRunMemorySAT
|
|
||||||
|
|
||||||
next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
|
|
||||||
got := next.(model)
|
|
||||||
if got.cursor != 1 {
|
|
||||||
t.Fatalf("cursor=%d want 1 after right", got.cursor)
|
|
||||||
}
|
|
||||||
|
|
||||||
next, _ = got.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
|
||||||
got = next.(model)
|
|
||||||
if got.screen != screenHealthCheck {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenHealthCheck)
|
|
||||||
}
|
|
||||||
if got.cursor != 0 {
|
|
||||||
t.Fatalf("cursor=%d want 0 after cancel", got.cursor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMainMenuSimpleTransitions(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
cursor int
|
|
||||||
wantScreen screen
|
|
||||||
}{
|
|
||||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
|
||||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, test := range tests {
|
|
||||||
test := test
|
|
||||||
t.Run(test.name, func(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.cursor = test.cursor
|
|
||||||
|
|
||||||
next, cmd := m.handleMainMenu()
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if cmd != nil {
|
|
||||||
t.Fatalf("expected nil cmd for %s", test.name)
|
|
||||||
}
|
|
||||||
if got.screen != test.wantScreen {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
|
||||||
}
|
|
||||||
if got.cursor != 0 {
|
|
||||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMainMenuExportSetsBusy(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.cursor = 1 // Export support bundle
|
|
||||||
|
|
||||||
next, cmd := m.handleMainMenu()
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if !got.busy {
|
|
||||||
t.Fatal("busy=false for export")
|
|
||||||
}
|
|
||||||
if cmd == nil {
|
|
||||||
t.Fatal("expected async cmd for export")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMainViewRendersTwoColumns(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.cursor = 1
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
for _, want := range []string{
|
|
||||||
"bee",
|
|
||||||
"Health Check",
|
|
||||||
"> Export support bundle",
|
|
||||||
"Settings",
|
|
||||||
"Exit",
|
|
||||||
"│",
|
|
||||||
"[↑↓] move",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestEscapeNavigation(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
screen screen
|
|
||||||
wantScreen screen
|
|
||||||
}{
|
|
||||||
{name: "network to settings", screen: screenNetwork, wantScreen: screenSettings},
|
|
||||||
{name: "services to settings", screen: screenServices, wantScreen: screenSettings},
|
|
||||||
{name: "settings to main", screen: screenSettings, wantScreen: screenMain},
|
|
||||||
{name: "service action to services", screen: screenServiceAction, wantScreen: screenServices},
|
|
||||||
{name: "export targets to main", screen: screenExportTargets, wantScreen: screenMain},
|
|
||||||
{name: "interface pick to network", screen: screenInterfacePick, wantScreen: screenNetwork},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, test := range tests {
|
|
||||||
test := test
|
|
||||||
t.Run(test.name, func(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = test.screen
|
|
||||||
m.cursor = 3
|
|
||||||
|
|
||||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEsc})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.screen != test.wantScreen {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
|
||||||
}
|
|
||||||
if got.cursor != 0 {
|
|
||||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHealthCheckEscReturnsToMain(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.hcCursor = 3
|
|
||||||
|
|
||||||
next, _ := m.updateHealthCheck(tea.KeyMsg{Type: tea.KeyEsc})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.screen != screenMain {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenMain)
|
|
||||||
}
|
|
||||||
if got.cursor != 0 {
|
|
||||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenOutput
|
|
||||||
m.prevScreen = screenNetwork
|
|
||||||
m.title = "title"
|
|
||||||
m.body = "body"
|
|
||||||
|
|
||||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEnter})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.screen != screenNetwork {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenNetwork)
|
|
||||||
}
|
|
||||||
if got.title != "" || got.body != "" {
|
|
||||||
t.Fatalf("expected output state cleared, got title=%q body=%q", got.title, got.body)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.hcInitialized = true
|
|
||||||
m.hcSel = [4]bool{true, true, true, true}
|
|
||||||
|
|
||||||
next, cmd := m.hcRunSingle(hcGPU)
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if cmd == nil {
|
|
||||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
|
||||||
}
|
|
||||||
if got.screen != screenNvidiaSATSetup {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
|
||||||
}
|
|
||||||
|
|
||||||
// esc from setup returns to health check
|
|
||||||
next, _ = got.updateNvidiaSATSetup(tea.KeyMsg{Type: tea.KeyEsc})
|
|
||||||
got = next.(model)
|
|
||||||
if got.screen != screenHealthCheck {
|
|
||||||
t.Fatalf("screen after esc=%q want %q", got.screen, screenHealthCheck)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHealthCheckRunSingleMapsActions(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tests := []struct {
|
|
||||||
idx int
|
|
||||||
want actionKind
|
|
||||||
}{
|
|
||||||
{idx: hcMemory, want: actionRunMemorySAT},
|
|
||||||
{idx: hcStorage, want: actionRunStorageSAT},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, test := range tests {
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenHealthCheck
|
|
||||||
m.hcInitialized = true
|
|
||||||
|
|
||||||
next, _ := m.hcRunSingle(test.idx)
|
|
||||||
got := next.(model)
|
|
||||||
if got.pendingAction != test.want {
|
|
||||||
t.Fatalf("idx=%d pendingAction=%q want %q", test.idx, got.pendingAction, test.want)
|
|
||||||
}
|
|
||||||
if got.screen != screenConfirm {
|
|
||||||
t.Fatalf("idx=%d screen=%q want %q", test.idx, got.screen, screenConfirm)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExportTargetSelectionOpensConfirm(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenExportTargets
|
|
||||||
m.targets = []platform.RemovableTarget{{Device: "/dev/sdb1", FSType: "vfat", Size: "16G"}}
|
|
||||||
|
|
||||||
next, cmd := m.handleExportTargetsMenu()
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if cmd != nil {
|
|
||||||
t.Fatal("expected nil cmd")
|
|
||||||
}
|
|
||||||
if got.screen != screenConfirm {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
|
||||||
}
|
|
||||||
if got.pendingAction != actionExportBundle {
|
|
||||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionExportBundle)
|
|
||||||
}
|
|
||||||
if got.selectedTarget == nil || got.selectedTarget.Device != "/dev/sdb1" {
|
|
||||||
t.Fatalf("selectedTarget=%+v want /dev/sdb1", got.selectedTarget)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestInterfacePickStaticIPv4OpensForm(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.pendingAction = actionStaticIPv4
|
|
||||||
m.interfaces = []platform.InterfaceInfo{{Name: "eth0"}}
|
|
||||||
|
|
||||||
next, cmd := m.handleInterfacePickMenu()
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if cmd != nil {
|
|
||||||
t.Fatal("expected nil cmd")
|
|
||||||
}
|
|
||||||
if got.screen != screenStaticForm {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenStaticForm)
|
|
||||||
}
|
|
||||||
if got.selectedIface != "eth0" {
|
|
||||||
t.Fatalf("selectedIface=%q want eth0", got.selectedIface)
|
|
||||||
}
|
|
||||||
if len(got.formFields) != 4 {
|
|
||||||
t.Fatalf("len(formFields)=%d want 4", len(got.formFields))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestResultMsgUsesExplicitBackScreen(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenConfirm
|
|
||||||
|
|
||||||
next, _ := m.Update(resultMsg{title: "done", body: "ok", back: screenNetwork})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.screen != screenOutput {
|
|
||||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
|
||||||
}
|
|
||||||
if got.prevScreen != screenNetwork {
|
|
||||||
t.Fatalf("prevScreen=%q want %q", got.prevScreen, screenNetwork)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestConfirmCancelTarget(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
|
|
||||||
m.pendingAction = actionExportBundle
|
|
||||||
if got := m.confirmCancelTarget(); got != screenExportTargets {
|
|
||||||
t.Fatalf("export cancel target=%q want %q", got, screenExportTargets)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.pendingAction = actionRunAll
|
|
||||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
|
||||||
t.Fatalf("run all cancel target=%q want %q", got, screenHealthCheck)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.pendingAction = actionRunMemorySAT
|
|
||||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
|
||||||
t.Fatalf("memory sat cancel target=%q want %q", got, screenHealthCheck)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.pendingAction = actionRunStorageSAT
|
|
||||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
|
||||||
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.pendingAction = actionNone
|
|
||||||
if got := m.confirmCancelTarget(); got != screenMain {
|
|
||||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewBusyStateIsMinimal(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.busy = true
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
want := "bee\n\nWorking...\n\n[ctrl+c] quit\n"
|
|
||||||
if view != want {
|
|
||||||
t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Export support bundle"
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"Export support bundle",
|
|
||||||
"Working...",
|
|
||||||
"[ctrl+c] quit",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenOutput
|
|
||||||
m.title = "Run audit"
|
|
||||||
m.body = "audit output: /appdata/bee/export/bee-audit.json\n"
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"Run audit",
|
|
||||||
"audit output: /appdata/bee/export/bee-audit.json",
|
|
||||||
"[enter/esc] back [ctrl+c] quit",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewRendersBannerModuleAboveScreenBody(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.banner = "System: Demo Server\nIP: 10.0.0.10"
|
|
||||||
m.width = 60
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"┌ MOTD ",
|
|
||||||
"System: Demo Server",
|
|
||||||
"IP: 10.0.0.10",
|
|
||||||
"Health Check",
|
|
||||||
"Export support bundle",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestSnapshotMsgUpdatesBannerAndPanel(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
|
|
||||||
next, cmd := m.Update(snapshotMsg{
|
|
||||||
banner: "System: Demo",
|
|
||||||
panel: app.HardwarePanelData{
|
|
||||||
Header: []string{"Demo header"},
|
|
||||||
Rows: []app.ComponentRow{
|
|
||||||
{Key: "CPU", Status: "PASS", Detail: "ok"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if cmd != nil {
|
|
||||||
t.Fatal("expected nil cmd")
|
|
||||||
}
|
|
||||||
if got.banner != "System: Demo" {
|
|
||||||
t.Fatalf("banner=%q want %q", got.banner, "System: Demo")
|
|
||||||
}
|
|
||||||
if len(got.panel.Rows) != 1 || got.panel.Rows[0].Key != "CPU" {
|
|
||||||
t.Fatalf("panel rows=%+v", got.panel.Rows)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenExportTargets
|
|
||||||
m.targets = []platform.RemovableTarget{
|
|
||||||
{
|
|
||||||
Device: "/dev/sdb1",
|
|
||||||
FSType: "vfat",
|
|
||||||
Size: "29G",
|
|
||||||
Label: "BEEUSB",
|
|
||||||
Mountpoint: "/media/bee",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"Export support bundle",
|
|
||||||
"Select removable filesystem",
|
|
||||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenStaticForm
|
|
||||||
m.selectedIface = "enp1s0"
|
|
||||||
m.formFields = []formField{
|
|
||||||
{Label: "Address", Value: "192.0.2.10/24"},
|
|
||||||
{Label: "Gateway", Value: "192.0.2.1"},
|
|
||||||
{Label: "DNS", Value: "1.1.1.1"},
|
|
||||||
}
|
|
||||||
m.formIndex = 1
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"Static IPv4: enp1s0",
|
|
||||||
" Address: 192.0.2.10/24",
|
|
||||||
"> Gateway: 192.0.2.1",
|
|
||||||
" DNS: 1.1.1.1",
|
|
||||||
"[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.screen = screenConfirm
|
|
||||||
m.pendingAction = actionExportBundle
|
|
||||||
m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
|
|
||||||
|
|
||||||
view := m.View()
|
|
||||||
|
|
||||||
for _, want := range []string{
|
|
||||||
"Export support bundle",
|
|
||||||
"Copy support bundle to /dev/sdb1?",
|
|
||||||
"> Confirm",
|
|
||||||
" Cancel",
|
|
||||||
} {
|
|
||||||
if !strings.Contains(view, want) {
|
|
||||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = "Export support bundle"
|
|
||||||
m.pendingAction = actionExportBundle
|
|
||||||
m.screen = screenConfirm
|
|
||||||
|
|
||||||
next, _ := m.Update(resultMsg{title: "Export support bundle", body: "done", back: screenMain})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.busy {
|
|
||||||
t.Fatal("busy=true want false")
|
|
||||||
}
|
|
||||||
if got.busyTitle != "" {
|
|
||||||
t.Fatalf("busyTitle=%q want empty", got.busyTitle)
|
|
||||||
}
|
|
||||||
if got.pendingAction != actionNone {
|
|
||||||
t.Fatalf("pendingAction=%q want empty", got.pendingAction)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
|
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
m := newTestModel()
|
|
||||||
|
|
||||||
next, _ := m.Update(resultMsg{title: "Export support bundle", err: assertErr("boom"), back: screenMain})
|
|
||||||
got := next.(model)
|
|
||||||
|
|
||||||
if got.body != "ERROR: boom" {
|
|
||||||
t.Fatalf("body=%q want %q", got.body, "ERROR: boom")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// assertErr is a string-backed error used by tests that need an error with
// a fixed message.
type assertErr string

// Error returns the underlying string as the error message.
func (e assertErr) Error() string {
	msg := string(e)
	return msg
}
|
|
||||||
@@ -1,200 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
"bee/audit/internal/runtimeenv"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
// screen identifies which view of the TUI is currently shown.
type screen string

// Known screens. The string values also appear in test failure output.
const (
	// Top-level menus.
	screenMain        screen = "main"
	screenHealthCheck screen = "health_check"
	screenSettings    screen = "settings"

	// Settings sub-screens.
	screenNetwork       screen = "network"
	screenInterfacePick screen = "interface_pick"
	screenServices      screen = "services"
	screenServiceAction screen = "service_action"

	// Export, generic output, form and confirmation screens.
	screenExportTargets screen = "export_targets"
	screenOutput        screen = "output"
	screenStaticForm    screen = "static_form"
	screenConfirm       screen = "confirm"

	// NVIDIA SAT setup and running screens.
	screenNvidiaSATSetup   screen = "nvidia_sat_setup"
	screenNvidiaSATRunning screen = "nvidia_sat_running"
)
|
|
||||||
|
|
||||||
// actionKind names an operation held in the model's pendingAction field.
type actionKind string

// Known actions. actionNone is the empty string, i.e. the zero value.
const (
	actionNone actionKind = ""

	// Network actions.
	actionDHCPOne    actionKind = "dhcp_one"
	actionStaticIPv4 actionKind = "static_ipv4"

	// Support bundle export.
	actionExportBundle actionKind = "export_bundle"

	// Health-check runs.
	actionRunAll        actionKind = "run_all"
	actionRunMemorySAT  actionKind = "run_memory_sat"
	actionRunStorageSAT actionKind = "run_storage_sat"
	actionRunCPUSAT     actionKind = "run_cpu_sat"
	actionRunAMDGPUSAT  actionKind = "run_amd_gpu_sat"
	actionRunFanStress  actionKind = "run_fan_stress"
)
|
|
||||||
|
|
||||||
// model is the bubbletea model of the TUI. It is passed by value: every
// handler receives a copy, mutates it and returns it as the next state.
type model struct {
	// Application backend and the runtime mode the TUI was started in.
	app         *app.App
	runtimeMode runtimeenv.Mode

	// Navigation and generic view state.
	// screen is the view rendered by View; prevScreen is where the output
	// screen returns to. cursor is the highlighted row of the active menu.
	// While busy is true, View shows busyTitle with a progress/"Working..."
	// body and Update ignores every key except ctrl+c.
	// title and body are the contents of the output screen.
	screen       screen
	prevScreen   screen
	cursor       int
	busy         bool
	busyTitle    string
	title        string
	body         string
	mainMenu     []string
	settingsMenu []string
	networkMenu  []string
	serviceMenu  []string

	// Lists loaded by background commands (servicesMsg, interfacesMsg,
	// exportTargetsMsg) and the current selections made from them.
	// pendingAction is the action the confirm screen is asking about.
	services        []string
	interfaces      []platform.InterfaceInfo
	targets         []platform.RemovableTarget
	selectedService string
	selectedIface   string
	selectedTarget  *platform.RemovableTarget
	pendingAction   actionKind

	// Static IPv4 form: the fields and the index of the focused one.
	formFields []formField
	formIndex  int

	// Hardware panel (right column)
	// panel and banner are replaced on every snapshotMsg. panelFocus is true
	// when the right column owns keyboard focus; panelCursor is its
	// highlighted row.
	panel       app.HardwarePanelData
	panelFocus  bool
	panelCursor int
	banner      string

	// Health Check screen
	// hcSel is indexed in the order GPU, Memory, Storage, CPU (see
	// confirmBody); hcMode indexes the three mode labels used there.
	hcSel         [4]bool
	hcMode        int
	hcCursor      int
	hcInitialized bool

	// NVIDIA SAT setup
	// NOTE(review): nvidiaGPUSel presumably parallels nvidiaGPUs and
	// nvidiaDurIdx indexes a duration list defined elsewhere — confirm.
	nvidiaGPUs      []platform.NvidiaGPU
	nvidiaGPUSel    []bool
	nvidiaDurIdx    int
	nvidiaSATCursor int

	// NVIDIA SAT running
	// nvidiaSATCancel, when non-nil, is called and cleared once the run
	// reports nvidiaSATDoneMsg; when nvidiaSATAborted is set that message is
	// ignored.
	nvidiaSATCancel  func()
	nvidiaSATAborted bool

	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
	// progressLines is what the busy view prints; progressPrefix and
	// progressSince are handed to pollSATProgress to fetch the next batch.
	progressLines  []string
	progressPrefix string
	progressSince  time.Time

	// Terminal size
	// Only the width is tracked; it is taken from tea.WindowSizeMsg.
	width int
}
|
|
||||||
|
|
||||||
// formField is one labelled text input of a form; renderForm prints it as
// "Label: Value".
type formField struct {
	Label string
	Value string
}
|
|
||||||
|
|
||||||
func Run(application *app.App, runtimeMode runtimeenv.Mode) error {
|
|
||||||
options := []tea.ProgramOption{}
|
|
||||||
if runtimeMode != runtimeenv.ModeLiveCD {
|
|
||||||
options = append(options, tea.WithAltScreen())
|
|
||||||
}
|
|
||||||
program := tea.NewProgram(newModel(application, runtimeMode), options...)
|
|
||||||
_, err := program.Run()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
|
||||||
return model{
|
|
||||||
app: application,
|
|
||||||
runtimeMode: runtimeMode,
|
|
||||||
screen: screenMain,
|
|
||||||
mainMenu: []string{
|
|
||||||
"Health Check",
|
|
||||||
"Export support bundle",
|
|
||||||
"Settings",
|
|
||||||
"Exit",
|
|
||||||
},
|
|
||||||
settingsMenu: []string{
|
|
||||||
"Network",
|
|
||||||
"Services",
|
|
||||||
"Re-run audit",
|
|
||||||
"Run self-check",
|
|
||||||
"Runtime issues",
|
|
||||||
"Audit logs",
|
|
||||||
"Check tools",
|
|
||||||
"Back",
|
|
||||||
},
|
|
||||||
networkMenu: []string{
|
|
||||||
"Show status",
|
|
||||||
"DHCP on all interfaces",
|
|
||||||
"DHCP on one interface",
|
|
||||||
"Set static IPv4",
|
|
||||||
"Back",
|
|
||||||
},
|
|
||||||
serviceMenu: []string{
|
|
||||||
"Status",
|
|
||||||
"Restart",
|
|
||||||
"Start",
|
|
||||||
"Stop",
|
|
||||||
"Back",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Init implements tea.Model. The initial command is a snapshot refresh; its
// snapshotMsg result fills in the banner and the hardware panel.
func (m model) Init() tea.Cmd {
	return m.refreshSnapshotCmd()
}
|
|
||||||
|
|
||||||
func (m model) confirmBody() (string, string) {
|
|
||||||
switch m.pendingAction {
|
|
||||||
case actionExportBundle:
|
|
||||||
if m.selectedTarget == nil {
|
|
||||||
return "Export support bundle", "No target selected"
|
|
||||||
}
|
|
||||||
return "Export support bundle", "Copy support bundle to " + m.selectedTarget.Device + "?"
|
|
||||||
case actionRunAll:
|
|
||||||
modes := []string{"Quick", "Standard", "Express"}
|
|
||||||
mode := modes[m.hcMode]
|
|
||||||
var sel []string
|
|
||||||
names := []string{"GPU", "Memory", "Storage", "CPU"}
|
|
||||||
for i, on := range m.hcSel {
|
|
||||||
if on {
|
|
||||||
sel = append(sel, names[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(sel) == 0 {
|
|
||||||
return "Health Check", "No components selected."
|
|
||||||
}
|
|
||||||
return "Health Check", "Run: " + strings.Join(sel, " + ") + "\nMode: " + mode
|
|
||||||
case actionRunMemorySAT:
|
|
||||||
return "Memory test", "Run memtester?"
|
|
||||||
case actionRunStorageSAT:
|
|
||||||
return "Storage test", "Run storage diagnostic pack?"
|
|
||||||
case actionRunCPUSAT:
|
|
||||||
modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"}
|
|
||||||
return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
|
|
||||||
case actionRunAMDGPUSAT:
|
|
||||||
return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
|
|
||||||
case actionRunFanStress:
|
|
||||||
modes := []string{"Quick (2×2min)", "Standard (2×5min)", "Express (2×10min)"}
|
|
||||||
return "Fan Stress Test", "Two-phase GPU thermal cycling test.\n" +
|
|
||||||
"Monitors fans, temps, power — detects throttling.\n" +
|
|
||||||
"Mode: " + modes[m.hcMode] + "\n\nAll NVIDIA GPUs will be stressed."
|
|
||||||
default:
|
|
||||||
return "Confirm", "Proceed?"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,260 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Update implements tea.Model. It dispatches on the message type: window
// resizes and key presses are handled directly, every other case is the
// result of a background command and moves the model to the screen that
// shows that result. Unknown messages leave the model unchanged.
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.WindowSizeMsg:
		// Only the width is tracked (used to size the banner box).
		m.width = msg.Width
		return m, nil
	case tea.KeyMsg:
		// While a command is running, swallow every key except ctrl+c.
		if m.busy {
			if msg.String() == "ctrl+c" {
				return m, tea.Quit
			}
			return m, nil
		}
		next, cmd := m.updateKey(msg)
		nextModel := next.(model)
		// Some transitions additionally trigger a snapshot refresh.
		if shouldRefreshSnapshot(m, nextModel) {
			return nextModel, tea.Batch(cmd, nextModel.refreshSnapshotCmd())
		}
		return nextModel, cmd
	case satProgressMsg:
		// Keep polling only while a run with a progress prefix is active; an
		// empty batch keeps the previously shown lines.
		if m.busy && m.progressPrefix != "" {
			if len(msg.lines) > 0 {
				m.progressLines = msg.lines
			}
			return m, pollSATProgress(m.progressPrefix, m.progressSince)
		}
		return m, nil
	case snapshotMsg:
		m.banner = msg.banner
		m.panel = msg.panel
		return m, nil
	case resultMsg:
		// A command finished: clear the busy/progress state and show its
		// output on the output screen.
		m.busy = false
		m.busyTitle = ""
		m.progressLines = nil
		m.progressPrefix = ""
		m.title = msg.title
		if msg.err != nil {
			// Append the error after any output; with no output show only the
			// error line.
			body := strings.TrimSpace(msg.body)
			if body == "" {
				m.body = fmt.Sprintf("ERROR: %v", msg.err)
			} else {
				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
			}
		} else {
			m.body = msg.body
		}
		m.pendingAction = actionNone
		// Return to the screen named by the message, or to the current one
		// when none was given.
		if msg.back != "" {
			m.prevScreen = msg.back
		} else {
			m.prevScreen = m.screen
		}
		m.screen = screenOutput
		m.cursor = 0
		return m, m.refreshSnapshotCmd()
	case servicesMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			m.title = "Services"
			m.body = msg.err.Error()
			m.prevScreen = screenSettings
			m.screen = screenOutput
			return m, m.refreshSnapshotCmd()
		}
		m.services = msg.services
		m.screen = screenServices
		m.cursor = 0
		return m, m.refreshSnapshotCmd()
	case interfacesMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			m.title = "interfaces"
			m.body = msg.err.Error()
			m.prevScreen = screenNetwork
			m.screen = screenOutput
			return m, m.refreshSnapshotCmd()
		}
		m.interfaces = msg.ifaces
		m.screen = screenInterfacePick
		m.cursor = 0
		return m, m.refreshSnapshotCmd()
	case exportTargetsMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			m.title = "export"
			m.body = msg.err.Error()
			m.prevScreen = screenMain
			m.screen = screenOutput
			return m, m.refreshSnapshotCmd()
		}
		m.targets = msg.targets
		m.screen = screenExportTargets
		m.cursor = 0
		return m, m.refreshSnapshotCmd()
	case nvidiaGPUsMsg:
		return m.handleNvidiaGPUsMsg(msg)
	case nvtopClosedMsg:
		return m, nil
	case nvidiaSATDoneMsg:
		// The result of an aborted run is dropped.
		if m.nvidiaSATAborted {
			return m, nil
		}
		// Call the stored cancel function once and forget it.
		if m.nvidiaSATCancel != nil {
			m.nvidiaSATCancel()
			m.nvidiaSATCancel = nil
		}
		m.prevScreen = screenHealthCheck
		m.screen = screenOutput
		m.title = msg.title
		if msg.err != nil {
			// Same error formatting as for resultMsg.
			body := strings.TrimSpace(msg.body)
			if body == "" {
				m.body = fmt.Sprintf("ERROR: %v", msg.err)
			} else {
				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
			}
		} else {
			m.body = msg.body
		}
		return m, m.refreshSnapshotCmd()
	}
	return m, nil
}
|
|
||||||
|
|
||||||
func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch m.screen {
|
|
||||||
case screenMain:
|
|
||||||
return m.updateMain(msg)
|
|
||||||
case screenHealthCheck:
|
|
||||||
return m.updateHealthCheck(msg)
|
|
||||||
case screenSettings:
|
|
||||||
return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
|
|
||||||
case screenNetwork:
|
|
||||||
return m.updateMenu(msg, len(m.networkMenu), m.handleNetworkMenu)
|
|
||||||
case screenServices:
|
|
||||||
return m.updateMenu(msg, len(m.services), m.handleServicesMenu)
|
|
||||||
case screenServiceAction:
|
|
||||||
return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
|
|
||||||
case screenNvidiaSATSetup:
|
|
||||||
return m.updateNvidiaSATSetup(msg)
|
|
||||||
case screenNvidiaSATRunning:
|
|
||||||
return m.updateNvidiaSATRunning(msg)
|
|
||||||
case screenExportTargets:
|
|
||||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
|
||||||
case screenInterfacePick:
|
|
||||||
return m.updateMenu(msg, len(m.interfaces), m.handleInterfacePickMenu)
|
|
||||||
case screenOutput:
|
|
||||||
switch msg.String() {
|
|
||||||
case "esc", "enter", "q":
|
|
||||||
m.screen = m.prevScreen
|
|
||||||
m.body = ""
|
|
||||||
m.title = ""
|
|
||||||
m.pendingAction = actionNone
|
|
||||||
return m, nil
|
|
||||||
case "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
case screenStaticForm:
|
|
||||||
return m.updateStaticForm(msg)
|
|
||||||
case screenConfirm:
|
|
||||||
return m.updateConfirm(msg)
|
|
||||||
}
|
|
||||||
if msg.String() == "ctrl+c" {
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateMain handles keys on the main (two-column) screen.
|
|
||||||
func (m model) updateMain(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
if m.panelFocus {
|
|
||||||
return m.updateMainPanel(msg)
|
|
||||||
}
|
|
||||||
// Switch focus to right panel.
|
|
||||||
if (msg.String() == "tab" || msg.String() == "right" || msg.String() == "l") && len(m.panel.Rows) > 0 {
|
|
||||||
m.panelFocus = true
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
return m.updateMenu(msg, len(m.mainMenu), m.handleMainMenu)
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateMainPanel handles keys when right panel has focus.
|
|
||||||
func (m model) updateMainPanel(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
||||||
switch msg.String() {
|
|
||||||
case "up", "k":
|
|
||||||
if m.panelCursor > 0 {
|
|
||||||
m.panelCursor--
|
|
||||||
}
|
|
||||||
case "down", "j":
|
|
||||||
if m.panelCursor < len(m.panel.Rows)-1 {
|
|
||||||
m.panelCursor++
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
if m.panelCursor < len(m.panel.Rows) {
|
|
||||||
key := m.panel.Rows[m.panelCursor].Key
|
|
||||||
m.busy = true
|
|
||||||
m.busyTitle = key
|
|
||||||
return m, func() tea.Msg {
|
|
||||||
r := m.app.ComponentDetailResult(key)
|
|
||||||
return resultMsg{title: r.Title, body: r.Body, back: screenMain}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case "tab", "left", "h", "esc":
|
|
||||||
m.panelFocus = false
|
|
||||||
case "q", "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) updateMenu(msg tea.KeyMsg, size int, onEnter func() (tea.Model, tea.Cmd)) (tea.Model, tea.Cmd) {
|
|
||||||
if size == 0 {
|
|
||||||
size = 1
|
|
||||||
}
|
|
||||||
switch msg.String() {
|
|
||||||
case "up", "k":
|
|
||||||
if m.cursor > 0 {
|
|
||||||
m.cursor--
|
|
||||||
}
|
|
||||||
case "down", "j":
|
|
||||||
if m.cursor < size-1 {
|
|
||||||
m.cursor++
|
|
||||||
}
|
|
||||||
case "enter":
|
|
||||||
return onEnter()
|
|
||||||
case "esc":
|
|
||||||
switch m.screen {
|
|
||||||
case screenNetwork, screenServices:
|
|
||||||
m.screen = screenSettings
|
|
||||||
m.cursor = 0
|
|
||||||
case screenSettings:
|
|
||||||
m.screen = screenMain
|
|
||||||
m.cursor = 0
|
|
||||||
case screenServiceAction:
|
|
||||||
m.screen = screenServices
|
|
||||||
m.cursor = 0
|
|
||||||
case screenExportTargets:
|
|
||||||
m.screen = screenMain
|
|
||||||
m.cursor = 0
|
|
||||||
case screenInterfacePick:
|
|
||||||
m.screen = screenNetwork
|
|
||||||
m.cursor = 0
|
|
||||||
}
|
|
||||||
case "q", "ctrl+c":
|
|
||||||
return m, tea.Quit
|
|
||||||
}
|
|
||||||
return m, nil
|
|
||||||
}
|
|
||||||
@@ -1,294 +0,0 @@
|
|||||||
package tui
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
|
||||||
|
|
||||||
tea "github.com/charmbracelet/bubbletea"
|
|
||||||
"github.com/charmbracelet/lipgloss"
|
|
||||||
)
|
|
||||||
|
|
||||||
// leftColWidth is the width, in terminal cells, that the left (menu) column
// of the two-column main layout is padded to.
const leftColWidth = 30

// Foreground styles used by colorStatus for the status column of the
// hardware panel. The colors are ANSI palette indexes.
var (
	stylePass   = lipgloss.NewStyle().Foreground(lipgloss.Color("10")) // bright green
	styleFail   = lipgloss.NewStyle().Foreground(lipgloss.Color("9"))  // bright red
	styleCancel = lipgloss.NewStyle().Foreground(lipgloss.Color("11")) // bright yellow
	styleNA     = lipgloss.NewStyle().Foreground(lipgloss.Color("8"))  // dark gray
)
|
|
||||||
|
|
||||||
func colorStatus(status string) string {
|
|
||||||
switch status {
|
|
||||||
case "PASS":
|
|
||||||
return stylePass.Render("PASS")
|
|
||||||
case "FAIL":
|
|
||||||
return styleFail.Render("FAIL")
|
|
||||||
case "CANCEL":
|
|
||||||
return styleCancel.Render("CANC")
|
|
||||||
default:
|
|
||||||
return styleNA.Render("N/A ")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m model) View() string {
|
|
||||||
var body string
|
|
||||||
if m.busy {
|
|
||||||
title := "bee"
|
|
||||||
if m.busyTitle != "" {
|
|
||||||
title = m.busyTitle
|
|
||||||
}
|
|
||||||
if len(m.progressLines) > 0 {
|
|
||||||
var b strings.Builder
|
|
||||||
fmt.Fprintf(&b, "%s\n\n", title)
|
|
||||||
for _, l := range m.progressLines {
|
|
||||||
fmt.Fprintf(&b, " %s\n", l)
|
|
||||||
}
|
|
||||||
b.WriteString("\n[ctrl+c] quit\n")
|
|
||||||
body = b.String()
|
|
||||||
} else {
|
|
||||||
body = fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
switch m.screen {
|
|
||||||
case screenMain:
|
|
||||||
body = renderTwoColumnMain(m)
|
|
||||||
case screenHealthCheck:
|
|
||||||
body = renderHealthCheck(m)
|
|
||||||
case screenSettings:
|
|
||||||
body = renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
|
|
||||||
case screenNetwork:
|
|
||||||
body = renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
|
||||||
case screenServices:
|
|
||||||
body = renderMenu("Services", "Select service", m.services, m.cursor)
|
|
||||||
case screenServiceAction:
|
|
||||||
body = renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
|
|
||||||
case screenExportTargets:
|
|
||||||
body = renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
|
|
||||||
case screenInterfacePick:
|
|
||||||
body = renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
|
||||||
case screenStaticForm:
|
|
||||||
body = renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
|
|
||||||
case screenConfirm:
|
|
||||||
title, confirmBody := m.confirmBody()
|
|
||||||
body = renderConfirm(title, confirmBody, m.cursor)
|
|
||||||
case screenNvidiaSATSetup:
|
|
||||||
body = renderNvidiaSATSetup(m)
|
|
||||||
case screenNvidiaSATRunning:
|
|
||||||
body = renderNvidiaSATRunning()
|
|
||||||
case screenOutput:
|
|
||||||
body = fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
|
||||||
default:
|
|
||||||
body = "bee\n"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return m.renderWithBanner(body)
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderTwoColumnMain renders the main screen with menu on the left and hardware panel on the right.
|
|
||||||
func renderTwoColumnMain(m model) string {
|
|
||||||
// Left column lines
|
|
||||||
leftLines := []string{"bee", ""}
|
|
||||||
for i, item := range m.mainMenu {
|
|
||||||
pfx := " "
|
|
||||||
if !m.panelFocus && m.cursor == i {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
leftLines = append(leftLines, pfx+item)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Right column lines
|
|
||||||
rightLines := buildPanelLines(m)
|
|
||||||
|
|
||||||
// Render side by side
|
|
||||||
var b strings.Builder
|
|
||||||
maxRows := max(len(leftLines), len(rightLines))
|
|
||||||
for i := 0; i < maxRows; i++ {
|
|
||||||
l := ""
|
|
||||||
if i < len(leftLines) {
|
|
||||||
l = leftLines[i]
|
|
||||||
}
|
|
||||||
r := ""
|
|
||||||
if i < len(rightLines) {
|
|
||||||
r = rightLines[i]
|
|
||||||
}
|
|
||||||
w := lipgloss.Width(l)
|
|
||||||
if w < leftColWidth {
|
|
||||||
l += strings.Repeat(" ", leftColWidth-w)
|
|
||||||
}
|
|
||||||
b.WriteString(l + " │ " + r + "\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
sep := strings.Repeat("─", leftColWidth) + "─┴─" + strings.Repeat("─", 46)
|
|
||||||
b.WriteString(sep + "\n")
|
|
||||||
|
|
||||||
if m.panelFocus {
|
|
||||||
b.WriteString("[↑↓] move [enter] details [tab/←] menu [ctrl+c] quit\n")
|
|
||||||
} else {
|
|
||||||
b.WriteString("[↑↓] move [enter] select [tab/→] panel [ctrl+c] quit\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func buildPanelLines(m model) []string {
|
|
||||||
p := m.panel
|
|
||||||
var lines []string
|
|
||||||
|
|
||||||
for _, h := range p.Header {
|
|
||||||
lines = append(lines, h)
|
|
||||||
}
|
|
||||||
if len(p.Header) > 0 && len(p.Rows) > 0 {
|
|
||||||
lines = append(lines, "")
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, row := range p.Rows {
|
|
||||||
pfx := " "
|
|
||||||
if m.panelFocus && m.panelCursor == i {
|
|
||||||
pfx = "> "
|
|
||||||
}
|
|
||||||
status := colorStatus(row.Status)
|
|
||||||
lines = append(lines, fmt.Sprintf("%s%s %-4s %s", pfx, status, row.Key, row.Detail))
|
|
||||||
}
|
|
||||||
|
|
||||||
return lines
|
|
||||||
}
|
|
||||||
|
|
||||||
func renderTargetItems(targets []platform.RemovableTarget) []string {
|
|
||||||
items := make([]string, 0, len(targets))
|
|
||||||
for _, target := range targets {
|
|
||||||
desc := fmt.Sprintf("%s [%s %s]", target.Device, target.FSType, target.Size)
|
|
||||||
if target.Label != "" {
|
|
||||||
desc += " label=" + target.Label
|
|
||||||
}
|
|
||||||
if target.Mountpoint != "" {
|
|
||||||
desc += " mounted=" + target.Mountpoint
|
|
||||||
}
|
|
||||||
items = append(items, desc)
|
|
||||||
}
|
|
||||||
return items
|
|
||||||
}
|
|
||||||
|
|
||||||
func renderInterfaceItems(interfaces []platform.InterfaceInfo) []string {
|
|
||||||
items := make([]string, 0, len(interfaces))
|
|
||||||
for _, iface := range interfaces {
|
|
||||||
label := iface.Name
|
|
||||||
if len(iface.IPv4) > 0 {
|
|
||||||
label += " [" + strings.Join(iface.IPv4, ", ") + "]"
|
|
||||||
}
|
|
||||||
items = append(items, label)
|
|
||||||
}
|
|
||||||
return items
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderMenu renders a list menu: title, subtitle, one line per item with
// "> " marking the item at cursor, and a key-hint footer. An empty list is
// shown as "(no items)".
func renderMenu(title, subtitle string, items []string, cursor int) string {
	var out strings.Builder
	fmt.Fprintf(&out, "%s\n\n%s\n\n", title, subtitle)

	if len(items) == 0 {
		out.WriteString("(no items)\n")
	}
	for idx, item := range items {
		marker := " "
		if idx == cursor {
			marker = "> "
		}
		out.WriteString(marker + item + "\n")
	}

	out.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
	return out.String()
}
|
|
||||||
|
|
||||||
func renderForm(title string, fields []formField, idx int) string {
|
|
||||||
var body strings.Builder
|
|
||||||
fmt.Fprintf(&body, "%s\n\n", title)
|
|
||||||
for i, field := range fields {
|
|
||||||
prefix := " "
|
|
||||||
if i == idx {
|
|
||||||
prefix = "> "
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&body, "%s%s: %s\n", prefix, field.Label, field.Value)
|
|
||||||
}
|
|
||||||
body.WriteString("\n[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel\n")
|
|
||||||
return body.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderConfirm renders a confirmation dialog: title, body, the two options
// "Confirm" and "Cancel" with "> " marking the one at cursor, and a key-hint
// footer.
func renderConfirm(title, body string, cursor int) string {
	var out strings.Builder
	fmt.Fprintf(&out, "%s\n\n%s\n\n", title, body)

	for idx, label := range [...]string{"Confirm", "Cancel"} {
		marker := " "
		if idx == cursor {
			marker = "> "
		}
		out.WriteString(marker + label + "\n")
	}

	out.WriteString("\n[←/→/↑/↓] move [enter] select [esc] cancel\n")
	return out.String()
}
|
|
||||||
|
|
||||||
// resultCmd returns a command that immediately yields a resultMsg built from
// the given title, body, error and screen to return to.
func resultCmd(title, body string, err error, back screen) tea.Cmd {
	return func() tea.Msg {
		return resultMsg{title: title, body: body, err: err, back: back}
	}
}
|
|
||||||
|
|
||||||
func (m model) renderWithBanner(body string) string {
|
|
||||||
body = strings.TrimRight(body, "\n")
|
|
||||||
banner := renderBannerModule(m.banner, m.width)
|
|
||||||
if banner == "" {
|
|
||||||
if body == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return body + "\n"
|
|
||||||
}
|
|
||||||
if body == "" {
|
|
||||||
return banner + "\n"
|
|
||||||
}
|
|
||||||
return banner + "\n\n" + body + "\n"
|
|
||||||
}
|
|
||||||
|
|
||||||
func renderBannerModule(banner string, width int) string {
|
|
||||||
banner = strings.TrimSpace(banner)
|
|
||||||
if banner == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
lines := strings.Split(banner, "\n")
|
|
||||||
contentWidth := 0
|
|
||||||
for _, line := range lines {
|
|
||||||
if w := lipgloss.Width(line); w > contentWidth {
|
|
||||||
contentWidth = w
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if width > 0 && width-4 > contentWidth {
|
|
||||||
contentWidth = width - 4
|
|
||||||
}
|
|
||||||
if contentWidth < 20 {
|
|
||||||
contentWidth = 20
|
|
||||||
}
|
|
||||||
|
|
||||||
label := " MOTD "
|
|
||||||
topFill := contentWidth + 2 - lipgloss.Width(label)
|
|
||||||
if topFill < 0 {
|
|
||||||
topFill = 0
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
b.WriteString("┌" + label + strings.Repeat("─", topFill) + "┐\n")
|
|
||||||
for _, line := range lines {
|
|
||||||
b.WriteString("│ " + padRight(line, contentWidth) + " │\n")
|
|
||||||
}
|
|
||||||
b.WriteString("└" + strings.Repeat("─", contentWidth+2) + "┘")
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func padRight(value string, width int) string {
|
|
||||||
if gap := width - lipgloss.Width(value); gap > 0 {
|
|
||||||
return value + strings.Repeat(" ", gap)
|
|
||||||
}
|
|
||||||
return value
|
|
||||||
}
|
|
||||||
1146
audit/internal/webui/api.go
Normal file
1146
audit/internal/webui/api.go
Normal file
File diff suppressed because it is too large
Load Diff
128
audit/internal/webui/api_test.go
Normal file
128
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http/httptest"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||||
|
t.Setenv("DISPLAY", "")
|
||||||
|
t.Setenv("XAUTHORITY", "")
|
||||||
|
|
||||||
|
cmd := xrandrCommand("--query")
|
||||||
|
|
||||||
|
var hasDisplay bool
|
||||||
|
var hasXAuthority bool
|
||||||
|
for _, kv := range cmd.Env {
|
||||||
|
if kv == "DISPLAY=:0" {
|
||||||
|
hasDisplay = true
|
||||||
|
}
|
||||||
|
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||||
|
hasXAuthority = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasDisplay {
|
||||||
|
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
if !hasXAuthority {
|
||||||
|
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHandleAPISATRunDecodesBodyWithoutContentLength verifies that the SAT
// run handler still decodes the JSON body when the request carries no known
// Content-Length, and queues exactly one task with the requested profile.
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
	// Start from an empty global queue and restore the previous tasks when
	// the test ends.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
	// -1 marks the body length as unknown.
	req.ContentLength = -1
	rec := httptest.NewRecorder()

	h.handleAPISATRun("cpu").ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
		t.Fatalf("burn profile=%q want smoke", got)
	}
}
|
||||||
|
|
||||||
|
// TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs verifies that the NVIDIA
// benchmark handler queues one "nvidia-benchmark" task carrying the GPU
// indices from the request and honouring an explicit run_nccl=false.
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
	// Start from an empty global queue and restore the previous tasks when
	// the test ends.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
	rec := httptest.NewRecorder()

	h.handleAPIBenchmarkNvidiaRun(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	task := globalQueue.tasks[0]
	if task.Target != "nvidia-benchmark" {
		t.Fatalf("target=%q want nvidia-benchmark", task.Target)
	}
	if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
		t.Fatalf("gpu indices=%v want [1 3]", got)
	}
	if task.params.RunNCCL {
		t.Fatal("RunNCCL should reflect explicit false from request")
	}
}
|
||||||
|
|
||||||
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
h := &handler{}
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_A", RPM: 4200},
|
||||||
|
{Name: "FAN_B", RPM: 5100},
|
||||||
|
})
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_B", RPM: 5200},
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||||
|
t.Fatalf("fanNames=%v", h.fanNames)
|
||||||
|
}
|
||||||
|
aVals, _ := h.ringFans[0].snapshot()
|
||||||
|
bVals, _ := h.ringFans[1].snapshot()
|
||||||
|
if len(aVals) != 2 || len(bVals) != 2 {
|
||||||
|
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||||
|
}
|
||||||
|
if aVals[1] != 4200 {
|
||||||
|
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||||
|
}
|
||||||
|
if bVals[1] != 5200 {
|
||||||
|
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||||
|
}
|
||||||
|
}
|
||||||
773
audit/internal/webui/charts_svg.go
Normal file
773
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,773 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// chartTimelineSegment is one span of a chart timeline, from Start to End.
// NOTE(review): Active presumably separates busy spans from idle ones (the
// timeline is passed to writeTimelineIdleSpans) — confirm against that
// helper.
type chartTimelineSegment struct {
	Start  time.Time
	End    time.Time
	Active bool
}
|
||||||
|
|
||||||
|
// chartScale describes a value axis: its bounds and tick values. It is
// produced by singleAxisChartScale and consumed by writeHorizontalGrid.
type chartScale struct {
	Min   float64
	Max   float64
	Ticks []float64
}
|
||||||
|
|
||||||
|
// chartLayout holds the canvas size of a chart (Width, Height) and the four
// edges of its plot area.
// NOTE(review): units are presumably SVG user units/pixels — confirm.
type chartLayout struct {
	Width      int
	Height     int
	PlotLeft   int
	PlotRight  int
	PlotTop    int
	PlotBottom int
}
|
||||||
|
|
||||||
|
// metricChartSeries is one named data series of a metric chart, with the
// color it is drawn in and its values (one per chart point).
// NOTE(review): AxisTitle is not set by renderMetricChartSVG; presumably it
// is used by a multi-axis renderer elsewhere — confirm.
type metricChartSeries struct {
	Name      string
	AxisTitle string
	Color     string
	Values    []float64
}
|
||||||
|
|
||||||
|
// metricChartPalette lists the series colors as hex RGB strings. Series i is
// drawn in metricChartPalette[i%len(metricChartPalette)], so colors repeat
// once there are more series than entries.
var metricChartPalette = []string{
	"#5794f2",
	"#73bf69",
	"#f2cc0c",
	"#ff9830",
	"#f2495c",
	"#b877d9",
	"#56d2f7",
	"#8ab8ff",
	"#9adf8f",
	"#ffbe5c",
}
|
||||||
|
|
||||||
|
// gpuLabelCache is a package-level cache of labels keyed by integer index,
// with its own mutex.
// NOTE(review): presumably mu guards both fields, the key is the GPU index
// and loadedAt is used to expire the cache — confirm against the code that
// fills it.
var gpuLabelCache struct {
	mu       sync.Mutex
	loadedAt time.Time
	byIndex  map[int]string
}
|
||||||
|
|
||||||
|
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{time.Time{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
statsLabel := chartStatsLabel(datasets)
|
||||||
|
|
||||||
|
legendItems := []metricChartSeries{}
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
values := make([]float64, pointCount)
|
||||||
|
if i < len(datasets) {
|
||||||
|
copy(values, coalesceDataset(datasets[i], pointCount))
|
||||||
|
}
|
||||||
|
legendItems = append(legendItems, metricChartSeries{
|
||||||
|
Name: name,
|
||||||
|
Color: color,
|
||||||
|
Values: values,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
scale := singleAxisChartScale(datasets, yMin, yMax)
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
for _, item := range legendItems {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
|
||||||
|
}
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
|
||||||
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
if temp == nil && power == nil && coreClock == nil {
|
||||||
|
return nil, false, nil
|
||||||
|
}
|
||||||
|
labels := sampleTimeLabels(samples)
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
svg, err := drawGPUOverviewChartSVG(
|
||||||
|
gpuDisplayLabel(idx)+" Overview",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[]metricChartSeries{
|
||||||
|
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||||
|
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||||
|
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||||
|
},
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false, err
|
||||||
|
}
|
||||||
|
return svg, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
if len(series) != 3 {
|
||||||
|
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||||
|
}
|
||||||
|
const (
|
||||||
|
width = 1400
|
||||||
|
height = 840
|
||||||
|
plotLeft = 180
|
||||||
|
plotRight = 1220
|
||||||
|
plotTop = 96
|
||||||
|
plotBottom = 660
|
||||||
|
)
|
||||||
|
const (
|
||||||
|
leftOuterAxis = 72
|
||||||
|
leftInnerAxis = 132
|
||||||
|
rightInnerAxis = 1268
|
||||||
|
)
|
||||||
|
layout := chartLayout{
|
||||||
|
Width: width,
|
||||||
|
Height: height,
|
||||||
|
PlotLeft: plotLeft,
|
||||||
|
PlotRight: plotRight,
|
||||||
|
PlotTop: plotTop,
|
||||||
|
PlotBottom: plotBottom,
|
||||||
|
}
|
||||||
|
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{time.Time{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range series {
|
||||||
|
if len(series[i].Values) == 0 {
|
||||||
|
series[i].Values = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
scales := make([]chartScale, len(series))
|
||||||
|
for i := range series {
|
||||||
|
min, max := chartSeriesBounds(series[i].Values)
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
scales[i] = chartScale{
|
||||||
|
Min: ticks[0],
|
||||||
|
Max: ticks[len(ticks)-1],
|
||||||
|
Ticks: ticks,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, width, height)
|
||||||
|
writeChartFrame(&b, title, "", width, height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scales[0])
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
|
||||||
|
for i, axisLineX := range axisX {
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
|
||||||
|
for _, tick := range scales[i].Ticks {
|
||||||
|
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
|
||||||
|
label := sanitizeChartText(chartYAxisNumber(tick))
|
||||||
|
if i < 2 {
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, y, axisLineX+6, y, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX-8, y, series[i].Color, label)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, y, axisLineX-6, y, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX+8, y, series[i].Color, label)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
for i := range series {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
|
||||||
|
}
|
||||||
|
writeLegend(&b, layout, series)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotTaskHistory() []Task {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
out := make([]Task, len(globalQueue.tasks))
|
||||||
|
for i, t := range globalQueue.tasks {
|
||||||
|
out[i] = *t
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
type interval struct {
|
||||||
|
start time.Time
|
||||||
|
end time.Time
|
||||||
|
}
|
||||||
|
active := make([]interval, 0, len(tasks))
|
||||||
|
for _, task := range tasks {
|
||||||
|
if task.StartedAt == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
intervalStart := task.StartedAt.UTC()
|
||||||
|
intervalEnd := now.UTC()
|
||||||
|
if task.DoneAt != nil {
|
||||||
|
intervalEnd = task.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if !intervalEnd.After(intervalStart) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if intervalEnd.Before(start) || intervalStart.After(end) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if intervalStart.Before(start) {
|
||||||
|
intervalStart = start
|
||||||
|
}
|
||||||
|
if intervalEnd.After(end) {
|
||||||
|
intervalEnd = end
|
||||||
|
}
|
||||||
|
active = append(active, interval{start: intervalStart, end: intervalEnd})
|
||||||
|
}
|
||||||
|
sort.Slice(active, func(i, j int) bool {
|
||||||
|
if active[i].start.Equal(active[j].start) {
|
||||||
|
return active[i].end.Before(active[j].end)
|
||||||
|
}
|
||||||
|
return active[i].start.Before(active[j].start)
|
||||||
|
})
|
||||||
|
merged := make([]interval, 0, len(active))
|
||||||
|
for _, span := range active {
|
||||||
|
if len(merged) == 0 {
|
||||||
|
merged = append(merged, span)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
last := &merged[len(merged)-1]
|
||||||
|
if !span.start.After(last.end) {
|
||||||
|
if span.end.After(last.end) {
|
||||||
|
last.end = span.end
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
merged = append(merged, span)
|
||||||
|
}
|
||||||
|
|
||||||
|
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
|
||||||
|
cursor := start
|
||||||
|
for _, span := range merged {
|
||||||
|
if span.start.After(cursor) {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
|
||||||
|
}
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
|
||||||
|
cursor = span.end
|
||||||
|
}
|
||||||
|
if cursor.Before(end) {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
|
||||||
|
}
|
||||||
|
if len(segments) == 0 {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
|
||||||
|
}
|
||||||
|
return segments
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||||
|
times := make([]time.Time, 0, len(samples))
|
||||||
|
for _, sample := range samples {
|
||||||
|
times = append(times, sample.Timestamp)
|
||||||
|
}
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
|
||||||
|
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||||
|
min, max := 0.0, 1.0
|
||||||
|
if yMin != nil && yMax != nil {
|
||||||
|
min, max = *yMin, *yMax
|
||||||
|
} else {
|
||||||
|
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||||
|
if yMin != nil {
|
||||||
|
min = *yMin
|
||||||
|
}
|
||||||
|
if yMax != nil {
|
||||||
|
max = *yMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||||
|
}
|
||||||
|
|
||||||
|
// flattenDatasets concatenates all datasets, in order, into one new slice.
// The result is never nil.
func flattenDatasets(datasets [][]float64) []float64 {
	size := 0
	for i := range datasets {
		size += len(datasets[i])
	}
	flat := make([]float64, size)
	at := 0
	for _, ds := range datasets {
		at += copy(flat[at:], ds)
	}
	return flat
}
|
||||||
|
|
||||||
|
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||||
|
legendRows := 0
|
||||||
|
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||||
|
cols := 4
|
||||||
|
if seriesCount < cols {
|
||||||
|
cols = seriesCount
|
||||||
|
}
|
||||||
|
legendRows = (seriesCount + cols - 1) / cols
|
||||||
|
}
|
||||||
|
legendHeight := 0
|
||||||
|
if legendRows > 0 {
|
||||||
|
legendHeight = legendRows*24 + 24
|
||||||
|
}
|
||||||
|
return chartLayout{
|
||||||
|
Width: 1400,
|
||||||
|
Height: canvasHeight,
|
||||||
|
PlotLeft: 96,
|
||||||
|
PlotRight: 1352,
|
||||||
|
PlotTop: 72,
|
||||||
|
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartTimeBounds returns the earliest and latest instants in times, both
// converted to UTC. For an empty slice both results are the zero time.
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
	if len(times) == 0 {
		return time.Time{}, time.Time{}
	}
	earliest := times[0].UTC()
	latest := earliest
	for i := 1; i < len(times); i++ {
		switch at := times[i].UTC(); {
		case at.Before(earliest):
			earliest = at
		case at.After(latest):
			latest = at
		}
	}
	return earliest, latest
}
|
||||||
|
|
||||||
|
// synthesizeChartTimes returns count timestamps for a chart whose real
// timestamps are missing or incomplete.
//
//   - count <= 0: nil.
//   - len(times) == count: times itself, unchanged.
//   - exactly one timestamp: a one-minute-spaced sequence starting at it.
//   - anything else: a one-minute-spaced sequence ending at the current UTC
//     time (the supplied timestamps are not used).
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
	if count <= 0 {
		return nil
	}

	var origin time.Time
	switch len(times) {
	case count:
		return times
	case 1:
		origin = times[0]
	default:
		origin = time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
	}

	series := make([]time.Time, 0, count)
	for i := 0; i < count; i++ {
		series = append(series, origin.Add(time.Duration(i)*time.Minute))
	}
	return series
}
|
||||||
|
|
||||||
|
// writeSVGOpen writes the opening <svg> tag for a canvas of the given size;
// the viewBox matches the pixel dimensions.
func writeSVGOpen(b *strings.Builder, width, height int) {
	w, h := strconv.Itoa(width), strconv.Itoa(height)
	b.WriteString(`<svg xmlns="http://www.w3.org/2000/svg" width="` + w + `" height="` + h + `" viewBox="0 0 ` + w + ` ` + h + `">`)
	b.WriteByte('\n')
}
|
||||||
|
|
||||||
|
// writeSVGClose writes the closing </svg> tag followed by a newline.
func writeSVGClose(b *strings.Builder) {
	b.WriteString("</svg>")
	b.WriteByte('\n')
}
|
||||||
|
|
||||||
|
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(title))
|
||||||
|
if strings.TrimSpace(subtitle) != "" {
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(subtitle))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||||
|
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotRight, y)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||||
|
if pointCount <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
ts := chartPointTime(times, idx)
|
||||||
|
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||||
|
x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||||
|
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
label := ""
|
||||||
|
if idx < len(labels) {
|
||||||
|
label = labels[idx]
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||||
|
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||||
|
if len(values) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var points strings.Builder
|
||||||
|
for idx, value := range values {
|
||||||
|
if idx > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||||
|
points.String(), color)
|
||||||
|
if len(values) == 1 {
|
||||||
|
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
peakIdx := 0
|
||||||
|
peakValue := values[0]
|
||||||
|
for idx, value := range values[1:] {
|
||||||
|
if value >= peakValue {
|
||||||
|
peakIdx = idx + 1
|
||||||
|
peakValue = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||||
|
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||||
|
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||||
|
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cols := 4
|
||||||
|
if len(series) < cols {
|
||||||
|
cols = len(series)
|
||||||
|
}
|
||||||
|
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||||
|
baseY := layout.PlotBottom + 74
|
||||||
|
for i, item := range series {
|
||||||
|
row := i / cols
|
||||||
|
col := i % cols
|
||||||
|
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||||
|
y := float64(baseY + row*24)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||||
|
x, y, x+28, y, item.Color)
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
x+38, y+4, sanitizeChartText(item.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||||
|
for _, segment := range segments {
|
||||||
|
if segment.Active || !segment.End.After(segment.Start) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||||
|
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen := map[int]bool{}
|
||||||
|
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||||
|
for i, segment := range segments {
|
||||||
|
if i > 0 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i < len(segments)-1 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartXForTime maps ts linearly from the time range [start, end] onto the
// pixel range [left, right]. Instants outside the range are clamped to its
// edges. When end is not after start the horizontal midpoint is returned.
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
	if end.After(start) {
		switch {
		case ts.Before(start):
			ts = start
		case ts.After(end):
			ts = end
		}
		elapsed := float64(ts.Sub(start))
		total := float64(end.Sub(start))
		return float64(left) + elapsed/total*float64(right-left)
	}
	return float64(left+right) / 2
}
|
||||||
|
|
||||||
|
// chartPointTime returns the UTC timestamp of chart point idx.
//
// If times has a non-zero entry at idx, that entry is used. Otherwise the
// timestamp is extrapolated as idx minutes after times[0] when that is
// non-zero, or idx minutes after the current time as a last resort.
func chartPointTime(times []time.Time, idx int) time.Time {
	offset := time.Duration(idx) * time.Minute
	inRange := idx >= 0 && idx < len(times)

	switch {
	case inRange && !times[idx].IsZero():
		return times[idx].UTC()
	case len(times) > 0 && !times[0].IsZero():
		return times[0].UTC().Add(offset)
	default:
		return time.Now().UTC().Add(offset)
	}
}
|
||||||
|
|
||||||
|
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||||
|
if scale.Max <= scale.Min {
|
||||||
|
return float64(plotTop+plotBottom) / 2
|
||||||
|
}
|
||||||
|
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartSeriesBounds returns padded lower and upper axis bounds for values.
//
// Empty input and all-zero constant input give (0, 1). A constant series is
// first widened by 10% of its magnitude on each side. A range lying entirely
// above zero is then padded by 20% of its span on both sides, with the lower
// bound floored at zero. Ranges that touch or cross zero are returned
// unpadded.
func chartSeriesBounds(values []float64) (float64, float64) {
	if len(values) == 0 {
		return 0, 1
	}

	lo, hi := values[0], values[0]
	for i := 1; i < len(values); i++ {
		switch v := values[i]; {
		case v < lo:
			lo = v
		case v > hi:
			hi = v
		}
	}

	if lo == hi {
		if hi == 0 {
			return 0, 1
		}
		margin := math.Abs(hi) * 0.1
		if margin == 0 {
			margin = 1
		}
		lo, hi = lo-margin, hi+margin
	}

	if !(lo > 0) {
		return lo, hi
	}
	margin := (hi - lo) * 0.2
	if margin == 0 {
		margin = hi * 0.1
	}
	lo -= margin
	if lo < 0 {
		lo = 0
	}
	return lo, hi + margin
}
|
||||||
|
|
||||||
|
// chartNiceTicks returns evenly spaced "nice" axis tick values covering
// [min, max]. The step is 1, 2, 5 or 10 times a power of ten, chosen so that
// roughly target ticks result (at most about 1.5*target intervals). The
// first tick is at or below min, the last at or above max.
//
// The result is never empty, so callers may index the first and last tick:
//   - non-finite bounds are replaced by the range 0..1;
//   - inverted bounds are swapped;
//   - equal bounds are widened to min..min+1;
//   - a target below 1 is treated as 1;
//   - a range for which no usable step exists (span underflows, overflows or
//     vanishes at this magnitude) yields just the two bounds.
//
// Ticks are computed as low + i*step rather than by repeated addition, so
// the loop is bounded even when step is smaller than the floating-point
// spacing around low.
func chartNiceTicks(min, max float64, target int) []float64 {
	if math.IsNaN(min) || math.IsInf(min, 0) || math.IsNaN(max) || math.IsInf(max, 0) {
		min, max = 0, 1
	}
	if min > max {
		min, max = max, min
	}
	if target < 1 {
		target = 1
	}
	if min == max {
		max = min + 1
	}

	span := max - min
	if !(span > 0) || math.IsInf(span, 0) {
		return []float64{min, max}
	}

	step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
	for _, factor := range []float64{1, 2, 5, 10} {
		if span/(factor*step) <= float64(target)*1.5 {
			step = factor * step
			break
		}
	}
	if !(step > 0) || math.IsInf(step, 0) {
		return []float64{min, max}
	}

	low := math.Floor(min/step) * step
	high := math.Ceil(max/step) * step

	intervals := int(math.Round((high - low) / step))
	if intervals < 1 {
		intervals = 1
	}
	// By construction there are at most 1.5*target+2 intervals; the cap only
	// guards against pathological floating-point input.
	if limit := 2*target + 4; intervals > limit {
		intervals = limit
	}

	ticks := make([]float64, 0, intervals+1)
	for i := 0; i <= intervals; i++ {
		value := low + float64(i)*step
		// Round away accumulated binary noise such as 0.30000000000000004.
		ticks = append(ticks, math.Round(value*1e9)/1e9)
	}
	return ticks
}
|
||||||
|
|
||||||
|
func valueClamp(value float64, scale chartScale) float64 {
|
||||||
|
if value < scale.Min {
|
||||||
|
return scale.Min
|
||||||
|
}
|
||||||
|
if value > scale.Max {
|
||||||
|
return scale.Max
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartStatsLabel(datasets [][]float64) string {
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("min %s avg %s max %s",
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDisplayLabel(idx int) string {
|
||||||
|
if name := gpuModelNameByIndex(idx); name != "" {
|
||||||
|
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU %d", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuModelNameByIndex(idx int) string {
|
||||||
|
now := time.Now()
|
||||||
|
gpuLabelCache.mu.Lock()
|
||||||
|
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||||
|
gpuLabelCache.loadedAt = now
|
||||||
|
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||||
|
gpuLabelCache.mu.Unlock()
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadGPUModelNames() map[int]string {
|
||||||
|
out := map[int]string{}
|
||||||
|
gpus, err := platform.New().ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name != "" {
|
||||||
|
out[gpu.Index] = name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
137
audit/internal/webui/jobs.go
Normal file
137
audit/internal/webui/jobs.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// jobState tracks one asynchronous job: the output lines produced so far,
// whether it has finished, and the subscribers streaming its output.
// The methods on jobState take mu before touching the other fields.
type jobState struct {
	// lines is every output line appended so far.
	lines []string
	// done is set by finish; err carries the failure message, if any.
	done bool
	err  string
	mu   sync.Mutex
	// subs are the channels of live subscribers; finish closes them all.
	subs []chan string
	// cancel optionally aborts the job; nil if the job is not cancellable.
	cancel func()
	// logPath, when non-empty, is a file each appended line is also written to.
	logPath string
}
|
||||||
|
|
||||||
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
|
func (j *jobState) abort() bool {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
if j.done || j.cancel == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
j.cancel()
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) append(line string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
j.lines = append(j.lines, line)
|
||||||
|
if j.logPath != "" {
|
||||||
|
appendJobLog(j.logPath, line)
|
||||||
|
}
|
||||||
|
for _, ch := range j.subs {
|
||||||
|
select {
|
||||||
|
case ch <- line:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) finish(errMsg string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
j.done = true
|
||||||
|
j.err = errMsg
|
||||||
|
for _, ch := range j.subs {
|
||||||
|
close(ch)
|
||||||
|
}
|
||||||
|
j.subs = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// subscribe returns a channel that receives all future lines.
|
||||||
|
// Existing lines are returned first, then the channel streams new ones.
|
||||||
|
func (j *jobState) subscribe() ([]string, <-chan string) {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
existing := make([]string, len(j.lines))
|
||||||
|
copy(existing, j.lines)
|
||||||
|
if j.done {
|
||||||
|
return existing, nil
|
||||||
|
}
|
||||||
|
ch := make(chan string, 256)
|
||||||
|
j.subs = append(j.subs, ch)
|
||||||
|
return existing, ch
|
||||||
|
}
|
||||||
|
|
||||||
|
// jobManager manages async jobs identified by string IDs.
|
||||||
|
type jobManager struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
jobs map[string]*jobState
|
||||||
|
}
|
||||||
|
|
||||||
|
// globalJobs is the process-wide job registry.
var globalJobs = &jobManager{
	jobs: map[string]*jobState{},
}
|
||||||
|
|
||||||
|
func (m *jobManager) create(id string) *jobState {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
j := &jobState{}
|
||||||
|
m.jobs[id] = j
|
||||||
|
// Schedule cleanup after 30 minutes
|
||||||
|
goRecoverOnce("job cleanup", func() {
|
||||||
|
time.Sleep(30 * time.Minute)
|
||||||
|
m.mu.Lock()
|
||||||
|
delete(m.jobs, id)
|
||||||
|
m.mu.Unlock()
|
||||||
|
})
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
|
||||||
|
// isDone returns true if the job has finished (either successfully or with error).
|
||||||
|
func (j *jobState) isDone() bool {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
return j.done
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *jobManager) get(id string) (*jobState, bool) {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
j, ok := m.jobs[id]
|
||||||
|
return j, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTaskJobState(logPath string) *jobState {
|
||||||
|
j := &jobState{logPath: logPath}
|
||||||
|
if logPath == "" {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(logPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||||
|
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||||
|
lines = lines[:len(lines)-1]
|
||||||
|
}
|
||||||
|
j.lines = append(j.lines, lines...)
|
||||||
|
return j
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendJobLog appends line plus a newline to the file at path, creating the
// file (mode 0644) if needed. It is best-effort: an empty path is a no-op and
// open/write failures are deliberately ignored.
func appendJobLog(path, line string) {
	if path == "" {
		return
	}
	const flags = os.O_CREATE | os.O_APPEND | os.O_WRONLY
	logFile, openErr := os.OpenFile(path, flags, 0644)
	if openErr != nil {
		return
	}
	defer logFile.Close()

	// Write errors are intentionally dropped; logging must not fail the job.
	_, _ = logFile.WriteString(line + "\n")
}
|
||||||
242
audit/internal/webui/kmsg_watcher.go
Normal file
242
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||||
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
|
type kmsgWatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
activeCount int // number of in-flight SAT tasks
|
||||||
|
window *kmsgWindow
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgWindow struct {
|
||||||
|
targets []string // SAT targets running concurrently
|
||||||
|
startedAt time.Time
|
||||||
|
seen map[kmsgEventKey]bool
|
||||||
|
events []kmsgEvent
|
||||||
|
}
|
||||||
|
|
||||||
|
// kmsgEventKey is the deduplication key for events inside a window.
type kmsgEventKey struct {
	id       string // BDF or device name; empty for events without an identifier
	category string // pattern category of the event
}
|
||||||
|
|
||||||
|
// kmsgEvent is one kernel log message that matched a hardware error pattern.
type kmsgEvent struct {
	timestamp time.Time // time the line was parsed (not the kernel timestamp)
	raw       string    // message text of the kmsg record
	ids       []string  // BDF addresses or device names extracted from the message
	category  string    // category of the matching pattern
}
|
||||||
|
|
||||||
|
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||||
|
return &kmsgWatcher{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
// start launches the background kmsg reading goroutine.
|
||||||
|
func (w *kmsgWatcher) start() {
|
||||||
|
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) run() {
|
||||||
|
for {
|
||||||
|
f, err := os.Open("/dev/kmsg")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
|
time.Sleep(30 * time.Second)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
|
}
|
||||||
|
_ = f.Close()
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||||
|
// Must be called with w.mu held.
|
||||||
|
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
key := kmsgEventKey{id: "", category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
key := kmsgEventKey{id: id, category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||||
|
// if this is the first task starting.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
if w.activeCount == 0 {
|
||||||
|
w.window = &kmsgWindow{
|
||||||
|
startedAt: time.Now(),
|
||||||
|
seen: make(map[kmsgEventKey]bool),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.activeCount++
|
||||||
|
if w.window != nil {
|
||||||
|
w.window.targets = append(w.window.targets, target)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||||
|
// it flushes the accumulated events to the status DB.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
w.activeCount--
|
||||||
|
var window *kmsgWindow
|
||||||
|
if w.activeCount <= 0 {
|
||||||
|
w.activeCount = 0
|
||||||
|
window = w.window
|
||||||
|
w.window = nil
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
if window == nil || len(window.events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
source := "watchdog:kmsg"
|
||||||
|
// Collect unique component keys from events.
|
||||||
|
seen := map[string]string{} // componentKey → first raw line
|
||||||
|
for _, evt := range window.events {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
// MCE or un-identified error.
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu", "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, detail := range seen {
|
||||||
|
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||||
|
w.statusDB.Record(key, source, "Warning", detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||||
|
msg := raw
|
||||||
|
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||||
|
msg = strings.TrimSpace(raw[idx+1:])
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
m := p.Re.FindStringSubmatch(msg)
|
||||||
|
if m == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
evt := kmsgEvent{
|
||||||
|
timestamp: time.Now(),
|
||||||
|
raw: msg,
|
||||||
|
category: p.Category,
|
||||||
|
}
|
||||||
|
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||||
|
}
|
||||||
|
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||||
|
}
|
||||||
|
return evt, true
|
||||||
|
}
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeBDF lower-cases and trims a PCIe BDF and, when it contains exactly
// one ':' (no domain part), prefixes the default domain so the result has the
// 4-part form "0000:c8:00.0". Other inputs are returned trimmed and
// lower-cased.
func normalizeBDF(bdf string) string {
	cleaned := strings.ToLower(strings.TrimSpace(bdf))
	if strings.Count(cleaned, ":") != 1 {
		return cleaned
	}
	return "0000:" + cleaned
}
|
||||||
|
|
||||||
|
// truncate returns s unchanged when it is at most max bytes long; otherwise
// it returns a prefix of at most max bytes followed by "...".
//
// The cut is moved back to the nearest rune boundary so a multi-byte UTF-8
// character is never split, and a negative max is treated as 0 instead of
// panicking on the slice expression.
func truncate(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}
	// range yields the byte offset of every rune start; keep the last one
	// that does not exceed max.
	cut := 0
	for i := range s {
		if i > max {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
|
||||||
|
|
||||||
|
// isSATTarget reports whether target names one of the known hardware
// acceptance test (SAT) tasks. Matching is exact and case-sensitive.
func isSATTarget(target string) bool {
	switch target {
	case "nvidia",
		"nvidia-targeted-stress",
		"nvidia-benchmark",
		"nvidia-compute",
		"nvidia-targeted-power",
		"nvidia-pulse",
		"nvidia-interconnect",
		"nvidia-bandwidth",
		"nvidia-stress":
		return true
	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
		return true
	case "memory", "memory-stress", "storage", "cpu", "sat-stress", "platform-stress":
		return true
	default:
		return false
	}
}
|
||||||
393
audit/internal/webui/metricsdb.go
Normal file
393
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,393 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"encoding/csv"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
|
)
|
||||||
|
|
||||||
|
// metricsDBPath is the default on-disk location of the metrics database.
const metricsDBPath = "/appdata/bee/metrics.db"

// MetricsDB stores live metric samples in a SQLite database.
type MetricsDB struct {
	db *sql.DB // underlying database handle
}
|
||||||
|
|
||||||
|
func (m *MetricsDB) Close() error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return m.db.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
db.SetMaxOpenConns(1)
|
||||||
|
if err := initMetricsSchema(db); err != nil {
|
||||||
|
_ = db.Close()
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &MetricsDB{db: db}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func initMetricsSchema(db *sql.DB) error {
|
||||||
|
_, err := db.Exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
clock_mhz REAL,
|
||||||
|
mem_clock_mhz REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureMetricsColumn adds column (with the given SQL definition) to table
// unless a column of that name, compared case-insensitively, already exists.
//
// table, column and definition are concatenated into the SQL text, so they
// must be trusted identifiers; the callers in this file pass constants only.
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
	rows, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		// Result columns of PRAGMA table_info: cid, name, type, notnull, dflt_value, pk.
		var (
			cid, notNull, pk int
			name, ctype      string
			dflt             sql.NullString
		)
		if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
			return err
		}
		if strings.EqualFold(name, column) {
			return nil // already present
		}
	}
	if err := rows.Err(); err != nil {
		return err
	}
	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
	return err
}
|
||||||
|
|
||||||
|
// Write inserts one sample into all relevant tables.
|
||||||
|
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||||
|
ts := s.Timestamp.Unix()
|
||||||
|
tx, err := m.db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||||
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||||
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||||
|
ts, f.Name, f.RPM,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
_, err = tx.Exec(
|
||||||
|
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||||
|
ts, t.Name, t.Group, t.Celsius,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
|
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
if m == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
return m.loadSamples(
|
||||||
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
|
start.Unix(), end.Unix(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
|
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||||
|
rows, err := m.db.Query(query, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
type sysRow struct {
|
||||||
|
ts int64
|
||||||
|
cpu, mem, pwr float64
|
||||||
|
}
|
||||||
|
var sysRows []sysRow
|
||||||
|
for rows.Next() {
|
||||||
|
var r sysRow
|
||||||
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sysRows = append(sysRows, r)
|
||||||
|
}
|
||||||
|
if len(sysRows) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
// Collect min/max ts for range query
|
||||||
|
minTS := sysRows[0].ts
|
||||||
|
maxTS := sysRows[len(sysRows)-1].ts
|
||||||
|
|
||||||
|
// Load GPU rows in range
|
||||||
|
type gpuKey struct {
|
||||||
|
ts int64
|
||||||
|
idx int
|
||||||
|
}
|
||||||
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
|
gRows, err := m.db.Query(
|
||||||
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
|
minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer gRows.Close()
|
||||||
|
for gRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var g platform.GPUMetricRow
|
||||||
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||||
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load fan rows in range
|
||||||
|
type fanKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
fanData := map[fanKey]float64{}
|
||||||
|
fRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer fRows.Close()
|
||||||
|
for fRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var name string
|
||||||
|
var rpm float64
|
||||||
|
if err := fRows.Scan(&ts, &name, &rpm); err == nil {
|
||||||
|
fanData[fanKey{ts, name}] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load temp rows in range
|
||||||
|
type tempKey struct {
|
||||||
|
ts int64
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
tempData := map[tempKey]platform.TempReading{}
|
||||||
|
tRows, err := m.db.Query(
|
||||||
|
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||||
|
)
|
||||||
|
if err == nil {
|
||||||
|
defer tRows.Close()
|
||||||
|
for tRows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var t platform.TempReading
|
||||||
|
if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
|
||||||
|
tempData[tempKey{ts, t.Name}] = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||||
|
// Sort each list so that sample reconstruction is deterministic regardless
|
||||||
|
// of Go's non-deterministic map iteration order.
|
||||||
|
seenGPU := map[int]bool{}
|
||||||
|
var gpuIndices []int
|
||||||
|
for k := range gpuData {
|
||||||
|
if !seenGPU[k.idx] {
|
||||||
|
seenGPU[k.idx] = true
|
||||||
|
gpuIndices = append(gpuIndices, k.idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(gpuIndices)
|
||||||
|
|
||||||
|
seenFan := map[string]bool{}
|
||||||
|
var fanNames []string
|
||||||
|
for k := range fanData {
|
||||||
|
if !seenFan[k.name] {
|
||||||
|
seenFan[k.name] = true
|
||||||
|
fanNames = append(fanNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Strings(fanNames)
|
||||||
|
|
||||||
|
seenTemp := map[string]bool{}
|
||||||
|
var tempNames []string
|
||||||
|
for k := range tempData {
|
||||||
|
if !seenTemp[k.name] {
|
||||||
|
seenTemp[k.name] = true
|
||||||
|
tempNames = append(tempNames, k.name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Strings(tempNames)
|
||||||
|
|
||||||
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
|
for i, r := range sysRows {
|
||||||
|
s := platform.LiveMetricSample{
|
||||||
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
|
CPULoadPct: r.cpu,
|
||||||
|
MemLoadPct: r.mem,
|
||||||
|
PowerW: r.pwr,
|
||||||
|
}
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
|
s.GPUs = append(s.GPUs, g)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range fanNames {
|
||||||
|
if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
|
||||||
|
s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, name := range tempNames {
|
||||||
|
if t, ok := tempData[tempKey{r.ts, name}]; ok {
|
||||||
|
s.Temps = append(s.Temps, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
samples[i] = s
|
||||||
|
}
|
||||||
|
return samples, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||||
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
|
rows, err := m.db.Query(`
|
||||||
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||||
|
g.clock_mhz, g.mem_clock_mhz
|
||||||
|
FROM sys_metrics s
|
||||||
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
|
ORDER BY s.ts, g.gpu_index
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
cw := csv.NewWriter(w)
|
||||||
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||||
|
for rows.Next() {
|
||||||
|
var ts int64
|
||||||
|
var cpu, mem, pwr float64
|
||||||
|
var gpuIdx sql.NullInt64
|
||||||
|
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||||
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
row := []string{
|
||||||
|
strconv.FormatInt(ts, 10),
|
||||||
|
strconv.FormatFloat(cpu, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(mem, 'f', 2, 64),
|
||||||
|
strconv.FormatFloat(pwr, 'f', 1, 64),
|
||||||
|
}
|
||||||
|
if gpuIdx.Valid {
|
||||||
|
row = append(row,
|
||||||
|
strconv.FormatInt(gpuIdx.Int64, 10),
|
||||||
|
strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
row = append(row, "", "", "", "", "", "", "")
|
||||||
|
}
|
||||||
|
_ = cw.Write(row)
|
||||||
|
}
|
||||||
|
cw.Flush()
|
||||||
|
return cw.Error()
|
||||||
|
}
|
||||||
|
|
||||||
|
// nullFloat wraps v in a sql.NullFloat64 that is marked valid (non-NULL).
func nullFloat(v float64) sql.NullFloat64 {
	var wrapped sql.NullFloat64
	wrapped.Float64 = v
	wrapped.Valid = true
	return wrapped
}
|
||||||
174
audit/internal/webui/metricsdb_test.go
Normal file
174
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,174 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||||
|
CPULoadPct: float64(10 + i),
|
||||||
|
MemLoadPct: float64(20 + i),
|
||||||
|
PowerW: float64(300 + i),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||||
|
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
all, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(all) != 3 {
|
||||||
|
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||||
|
}
|
||||||
|
for i, sample := range all {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||||
|
}
|
||||||
|
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recent, err := db.LoadRecent(2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadRecent: %v", err)
|
||||||
|
}
|
||||||
|
if len(recent) != 2 {
|
||||||
|
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||||
|
}
|
||||||
|
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||||
|
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||||
|
}
|
||||||
|
for i, sample := range recent {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||||
|
raw, err := sql.Open("sqlite", path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("sql.Open: %v", err)
|
||||||
|
}
|
||||||
|
_, err = raw.Exec(`
|
||||||
|
CREATE TABLE gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create legacy schema: %v", err)
|
||||||
|
}
|
||||||
|
_ = raw.Close()
|
||||||
|
|
||||||
|
db, err := openMetricsDB(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
now := time.Unix(1_700_000_100, 0).UTC()
|
||||||
|
err = db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: now,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
samples, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||||
|
t.Fatalf("samples=%+v", samples)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||||
|
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||||
|
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||||
|
CPULoadPct: float64(i),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadBetween: %v", err)
|
||||||
|
}
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||||
|
}
|
||||||
|
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||||
|
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||||
|
}
|
||||||
|
}
|
||||||
2821
audit/internal/webui/pages.go
Normal file
2821
audit/internal/webui/pages.go
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -7,9 +7,457 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestRootRendersShellWithIframe(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 0.4, want: "0"},
|
||||||
|
{in: 61.5, want: "62"},
|
||||||
|
{in: 999.4, want: "999"},
|
||||||
|
{in: 1200, want: "1,2k"},
|
||||||
|
{in: 1250, want: "1,25k"},
|
||||||
|
{in: 1310, want: "1,31k"},
|
||||||
|
{in: 1500, want: "1,5k"},
|
||||||
|
{in: 2600, want: "2,6k"},
|
||||||
|
{in: 10200, want: "10k"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartLegendNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartLegendNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
panic("boom")
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||||
|
t.Fatalf("body=%q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if !sseStart(w) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !sseWrite(w, "tick", "ok") {
|
||||||
|
t.Fatal("expected sse write to succeed")
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
|
||||||
|
t.Fatalf("content-type=%q", got)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
|
||||||
|
t.Fatalf("body=%q", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
|
CPULoadPct: 10,
|
||||||
|
MemLoadPct: 20,
|
||||||
|
PowerW: 300,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
CPULoadPct: 30,
|
||||||
|
MemLoadPct: 40,
|
||||||
|
PowerW: 320,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
CPULoadPct: 50,
|
||||||
|
MemLoadPct: 60,
|
||||||
|
PowerW: 340,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 1 || names[0] != "GPU 0" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != len(samples) {
|
||||||
|
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||||
|
t.Fatalf("datasets shape=%v", datasets)
|
||||||
|
}
|
||||||
|
if got := datasets[0][0]; got != 120 {
|
||||||
|
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||||
|
}
|
||||||
|
if got := datasets[0][2]; got != 130 {
|
||||||
|
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 7, PowerW: 170},
|
||||||
|
{GPUIndex: 2, PowerW: 120},
|
||||||
|
{GPUIndex: 0, PowerW: 100},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: 101},
|
||||||
|
{GPUIndex: 7, PowerW: 171},
|
||||||
|
{GPUIndex: 2, PowerW: 121},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||||
|
if len(names) != len(wantNames) {
|
||||||
|
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||||
|
}
|
||||||
|
for i := range wantNames {
|
||||||
|
if names[i] != wantNames[i] {
|
||||||
|
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||||
|
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||||
|
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||||
|
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1400},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1500},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1510},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Core Clock" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if got := datasets[1][1]; got != 1510 {
|
||||||
|
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||||
|
body := renderMetrics()
|
||||||
|
if !strings.Contains(body, "const probe = new Image();") {
|
||||||
|
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
|
||||||
|
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
|
||||||
|
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||||
|
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||||
|
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||||
|
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
|
if !chartLegendVisible(8) {
|
||||||
|
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||||
|
}
|
||||||
|
if chartLegendVisible(9) {
|
||||||
|
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 999, want: "999"},
|
||||||
|
{in: 1000, want: "1к"},
|
||||||
|
{in: 1370, want: "1,4к"},
|
||||||
|
{in: 1500, want: "1,5к"},
|
||||||
|
{in: 1700, want: "1,7к"},
|
||||||
|
{in: 2000, want: "2к"},
|
||||||
|
{in: 9999, want: "10к"},
|
||||||
|
{in: 10200, want: "10к"},
|
||||||
|
{in: -1500, want: "-1,5к"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartCanvasHeight(t *testing.T) {
|
||||||
|
if got := chartCanvasHeight(4); got != 360 {
|
||||||
|
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||||
|
}
|
||||||
|
if got := chartCanvasHeight(12); got != 288 {
|
||||||
|
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
end := start.Add(10 * time.Minute)
|
||||||
|
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
|
||||||
|
s := start.Add(offsetStart)
|
||||||
|
e := start.Add(offsetEnd)
|
||||||
|
return Task{
|
||||||
|
Name: "task",
|
||||||
|
Status: TaskDone,
|
||||||
|
StartedAt: &s,
|
||||||
|
DoneAt: &e,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
|
||||||
|
taskWindow(1*time.Minute, 3*time.Minute),
|
||||||
|
taskWindow(2*time.Minute, 5*time.Minute),
|
||||||
|
taskWindow(7*time.Minute, 8*time.Minute),
|
||||||
|
})
|
||||||
|
if len(segments) != 5 {
|
||||||
|
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
|
||||||
|
}
|
||||||
|
wantActive := []bool{false, true, false, true, false}
|
||||||
|
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
|
||||||
|
for i, segment := range segments {
|
||||||
|
if segment.Active != wantActive[i] {
|
||||||
|
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
|
||||||
|
}
|
||||||
|
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
|
||||||
|
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
|
||||||
|
}
|
||||||
|
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
|
||||||
|
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
labels := []string{"12:00", "12:01", "12:02"}
|
||||||
|
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
|
||||||
|
svg, err := renderMetricChartSVG(
|
||||||
|
"System Power",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[][]float64{{300, 320, 310}},
|
||||||
|
[]string{"Power W"},
|
||||||
|
floatPtr(0),
|
||||||
|
floatPtr(400),
|
||||||
|
360,
|
||||||
|
[]chartTimelineSegment{
|
||||||
|
{Start: start, End: start.Add(time.Minute), Active: false},
|
||||||
|
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
body := string(svg)
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("svg missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `opacity="0.10"`) {
|
||||||
|
t.Fatalf("svg missing idle overlay opacity: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `System Power`) {
|
||||||
|
t.Fatalf("svg missing chart title: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = db.db.Close() })
|
||||||
|
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
for i, sample := range []platform.LiveMetricSample{
|
||||||
|
{Timestamp: start, PowerW: 300},
|
||||||
|
{Timestamp: start.Add(time.Minute), PowerW: 320},
|
||||||
|
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
|
||||||
|
} {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("write sample %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
prevTasks := globalQueue.tasks
|
||||||
|
s := start.Add(30 * time.Second)
|
||||||
|
e := start.Add(90 * time.Second)
|
||||||
|
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = prevTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
|
||||||
|
h.handleMetricsChartSVG(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("custom svg response missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `stroke-linecap="round"`) {
|
||||||
|
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
|
r1 := newMetricsRing(4)
|
||||||
|
r2 := newMetricsRing(4)
|
||||||
|
r1.push(1000)
|
||||||
|
r1.push(1100)
|
||||||
|
r2.push(1200)
|
||||||
|
r2.push(1300)
|
||||||
|
|
||||||
|
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||||
|
if len(datasets) != 2 {
|
||||||
|
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != 2 {
|
||||||
|
t.Fatalf("labels=%v want 2 entries", labels)
|
||||||
|
}
|
||||||
|
if labels[0] == "" || labels[1] == "" {
|
||||||
|
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||||
|
body := renderNetworkInline()
|
||||||
|
if !strings.Contains(body, "d.pending_change") {
|
||||||
|
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||||
|
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||||
|
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
exportDir := filepath.Join(dir, "export")
|
exportDir := filepath.Join(dir, "export")
|
||||||
@@ -21,9 +469,10 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{
|
handler := NewHandler(HandlerOptions{
|
||||||
Title: "Bee Hardware Audit",
|
Title: "Bee Hardware Audit",
|
||||||
AuditPath: path,
|
BuildLabel: "1.2.3",
|
||||||
ExportDir: exportDir,
|
AuditPath: path,
|
||||||
|
ExportDir: exportDir,
|
||||||
})
|
})
|
||||||
|
|
||||||
first := httptest.NewRecorder()
|
first := httptest.NewRecorder()
|
||||||
@@ -31,11 +480,17 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
|||||||
if first.Code != http.StatusOK {
|
if first.Code != http.StatusOK {
|
||||||
t.Fatalf("first status=%d", first.Code)
|
t.Fatalf("first status=%d", first.Code)
|
||||||
}
|
}
|
||||||
if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
|
// Dashboard should contain the audit nav link and hardware summary
|
||||||
t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
|
if !strings.Contains(first.Body.String(), `href="/audit"`) {
|
||||||
|
t.Fatalf("first body missing audit nav link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
|
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||||
t.Fatalf("first body missing support bundle link: %s", first.Body.String())
|
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||||
|
}
|
||||||
|
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||||
|
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||||
|
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||||
|
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||||
t.Fatalf("first cache-control=%q", got)
|
t.Fatalf("first cache-control=%q", got)
|
||||||
@@ -50,8 +505,221 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
|||||||
if second.Code != http.StatusOK {
|
if second.Code != http.StatusOK {
|
||||||
t.Fatalf("second status=%d", second.Code)
|
t.Fatalf("second status=%d", second.Code)
|
||||||
}
|
}
|
||||||
if !strings.Contains(second.Body.String(), `src="/viewer"`) {
|
if !strings.Contains(second.Body.String(), `Hardware Summary`) {
|
||||||
t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
|
t.Fatalf("second body missing hardware summary: %s", second.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{
|
||||||
|
Title: "Bee Hardware Audit",
|
||||||
|
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||||
|
ExportDir: exportDir,
|
||||||
|
})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||||
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `No audit data`) {
|
||||||
|
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(rec.Body.String()) != "ready" {
|
||||||
|
t.Fatalf("body=%q want ready", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
|
||||||
|
t.Fatalf("audit page missing viewer frame: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `openAuditModal()`) {
|
||||||
|
t.Fatalf("audit page missing action modal trigger: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||||
|
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||||
|
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||||
|
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||||
|
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Export to USB`) {
|
||||||
|
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||||
|
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`href="/benchmark"`,
|
||||||
|
`id="benchmark-gpu-list"`,
|
||||||
|
`/api/gpu/nvidia`,
|
||||||
|
`/api/benchmark/nvidia/run`,
|
||||||
|
`benchmark-run-nccl`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA GPU Targeted Stress`,
|
||||||
|
`nvidia-targeted-stress`,
|
||||||
|
`controlled NVIDIA DCGM load`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Max Compute Load`,
|
||||||
|
`dcgmproftester`,
|
||||||
|
`targeted_stress remain in <a href="/validate">Validate</a>`,
|
||||||
|
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
|
||||||
|
`id="burn-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||||
|
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
reportPath := filepath.Join(reportDir, "report.html")
|
||||||
|
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
ArtifactsDir: reportDir,
|
||||||
|
ReportHTMLPath: reportPath,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `saved report`) {
|
||||||
|
t.Fatalf("task detail page missing saved report: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Back to Tasks`) {
|
||||||
|
t.Fatalf("task detail page missing back link: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,8 +771,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
|||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
if got := strings.TrimSpace(rec.Body.String()); got != body {
|
if !strings.Contains(rec.Body.String(), "SERIAL-API") {
|
||||||
t.Fatalf("body=%q want %q", got, body)
|
t.Fatalf("body missing expected serial: %s", rec.Body.String())
|
||||||
}
|
}
|
||||||
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
||||||
t.Fatalf("content-type=%q", got)
|
t.Fatalf("content-type=%q", got)
|
||||||
@@ -129,6 +797,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||||
|
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := archive.Close(); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
@@ -165,3 +844,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
|||||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestDashboardRendersRuntimeHealthTable writes an audit snapshot, a
// runtime-health.json and a component-status.json into a temporary directory,
// requests "/" from the handler and verifies that the rendered dashboard
// contains every expected heading, table cell and issue text.
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "audit.json")
	exportDir := filepath.Join(dir, "export")
	if err := os.MkdirAll(exportDir, 0755); err != nil {
		t.Fatal(err)
	}
	// Minimal audit snapshot passed to the handler as AuditPath.
	if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
		t.Fatal(err)
	}
	// Runtime health fixture: partial status with two issues, one failing
	// tool (nvidia-smi) and one inactive service (bee-nvidia).
	health := `{
"status":"PARTIAL",
"checked_at":"2026-03-16T10:00:00Z",
"export_dir":"/tmp/export",
"driver_ready":true,
"cuda_ready":false,
"network_status":"PARTIAL",
"issues":[
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
],
"tools":[
{"name":"dmidecode","ok":true},
{"name":"nvidia-smi","ok":false}
],
"services":[
{"name":"bee-web","status":"active"},
{"name":"bee-nvidia","status":"inactive"}
]
}`
	if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
		t.Fatal(err)
	}
	// Component status fixture: one entry each for CPU, memory, storage and
	// GPU, each carrying a single history record from a "sat:*" source.
	componentStatus := `[
{
"component_key":"cpu:all",
"status":"Warning",
"error_summary":"cpu SAT: FAILED",
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
},
{
"component_key":"memory:all",
"status":"OK",
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
},
{
"component_key":"storage:nvme0n1",
"status":"Critical",
"error_summary":"storage SAT: FAILED",
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
},
{
"component_key":"pcie:gpu:nvidia",
"status":"Warning",
"error_summary":"nvidia SAT: FAILED",
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
}
]`
	if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
		t.Fatal(err)
	}

	handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
	rec := httptest.NewRecorder()
	handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
	if rec.Code != http.StatusOK {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	body := rec.Body.String()
	// Every needle must appear somewhere in the page; the first missing one
	// aborts the test and dumps the whole body.
	for _, needle := range []string{
		`Runtime Health`,
		`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
		`Export Directory`,
		`Network`,
		`NVIDIA/AMD Driver`,
		`CUDA / ROCm`,
		`Required Utilities`,
		`Bee Services`,
		`<td>CPU</td>`,
		`<td>Memory</td>`,
		`<td>Storage</td>`,
		`<td>GPU</td>`,
		`CUDA runtime is not ready for GPU SAT.`,
		`Missing: nvidia-smi`,
		`bee-nvidia=inactive`,
		`cpu SAT: FAILED`,
		`storage SAT: FAILED`,
		`sat:nvidia`,
	} {
		if !strings.Contains(body, needle) {
			t.Fatalf("dashboard missing %q: %s", needle, body)
		}
	}
}
|
||||||
|
|||||||
42
audit/internal/webui/stability.go
Normal file
42
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
if !runRecoverable(name, fn) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if restartDelay > 0 {
|
||||||
|
time.Sleep(restartDelay)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func goRecoverOnce(name string, fn func()) {
|
||||||
|
go func() {
|
||||||
|
_ = runRecoverable(name, fn)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// runRecoverable calls fn and reports whether it panicked. A panic is
// recovered and logged through slog together with the component name, the
// panic value and the current stack trace; it never propagates to the caller.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		cause := recover()
		if cause == nil {
			return
		}
		// The named result lets the deferred function override the
		// "return false" below once a panic has been intercepted.
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(cause),
			"stack", string(debug.Stack()),
		)
	}()
	fn()
	return false
}
|
||||||
85
audit/internal/webui/task_page.go
Normal file
85
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
	"fmt"
	"html"
	"net/http"
	"net/url"
	"os"
	"strings"
)
|
||||||
|
|
||||||
|
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
task, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
snapshot := *task
|
||||||
|
body := renderTaskDetailPage(h.opts, snapshot)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||||
|
title := task.Name
|
||||||
|
if strings.TrimSpace(title) == "" {
|
||||||
|
title = task.ID
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||||
|
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||||
|
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||||
|
body.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if report := loadTaskReportFragment(task); report != "" {
|
||||||
|
body.WriteString(report)
|
||||||
|
} else {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||||
|
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||||
|
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||||
|
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
body.WriteString(`<script>
|
||||||
|
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||||
|
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||||
|
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||||
|
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||||
|
_taskDetailES.addEventListener('done', function(){ _taskDetailES.close(); setTimeout(function(){ window.location.reload(); }, 1000); });
|
||||||
|
_taskDetailES.onerror = function(){ _taskDetailES.close(); };
|
||||||
|
</script>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
return layoutHead(opts.Title+" — "+title) +
|
||||||
|
layoutNav("tasks", opts.BuildLabel) +
|
||||||
|
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||||
|
body.String() +
|
||||||
|
`</div></div></body></html>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadTaskReportFragment(task Task) string {
|
||||||
|
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||||
|
if strings.TrimSpace(absPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||||
|
}
|
||||||
286
audit/internal/webui/task_report.go
Normal file
286
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskReportMetricsDBPath is the metrics database path opened by
// loadTaskMetricSamples. It is a variable initialised from metricsDBPath so
// that it can be reassigned; the task report test points it at a temporary
// database.
var taskReportMetricsDBPath = metricsDBPath
|
||||||
|
|
||||||
|
// taskReport is the task summary that writeTaskReportArtifacts serialises to
// the task's report JSON file and renders into its HTML report fragment.
type taskReport struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Target      string            `json:"target"`
	Status      string            `json:"status"`
	CreatedAt   time.Time         `json:"created_at"`
	StartedAt   *time.Time        `json:"started_at,omitempty"`   // nil is omitted from the JSON
	DoneAt      *time.Time        `json:"done_at,omitempty"`      // nil is omitted from the JSON
	DurationSec int               `json:"duration_sec,omitempty"` // elapsed seconds; zero is omitted
	Error       string            `json:"error,omitempty"`        // task error message, if any
	LogFile     string            `json:"log_file,omitempty"`     // base name of the task log file
	Charts      []taskReportChart `json:"charts,omitempty"`       // charts written next to the report
	GeneratedAt time.Time         `json:"generated_at"`           // UTC time the report was produced
}
|
||||||
|
|
||||||
|
// taskReportChart describes one chart listed in a task report.
type taskReportChart struct {
	Title string `json:"title"` // heading shown above the chart
	File  string `json:"file"`  // SVG file name inside the task's artifacts directory
}
|
||||||
|
|
||||||
|
// taskChartSpec pairs a chart identifier with the file it is saved under.
type taskChartSpec struct {
	Path string // chart identifier passed to renderTaskChartSVG
	File string // file name used inside the task's artifacts directory
}
|
||||||
|
|
||||||
|
// taskDashboardChartSpecs lists the charts writeTaskCharts attempts to render
// for every task report, in output order. Per-GPU overview charts are added
// separately by writeTaskCharts.
var taskDashboardChartSpecs = []taskChartSpec{
	{Path: "server-load", File: "server-load.svg"},
	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
	{Path: "server-power", File: "server-power.svg"},
	{Path: "server-fans", File: "server-fans.svg"},
	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
}
|
||||||
|
|
||||||
|
func writeTaskReportArtifacts(t *Task) error {
|
||||||
|
if t == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
start, end := taskTimeWindow(t)
|
||||||
|
samples, _ := loadTaskMetricSamples(start, end)
|
||||||
|
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||||
|
|
||||||
|
logText := ""
|
||||||
|
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||||
|
logText = string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
report := taskReport{
|
||||||
|
ID: t.ID,
|
||||||
|
Name: t.Name,
|
||||||
|
Target: t.Target,
|
||||||
|
Status: t.Status,
|
||||||
|
CreatedAt: t.CreatedAt,
|
||||||
|
StartedAt: t.StartedAt,
|
||||||
|
DoneAt: t.DoneAt,
|
||||||
|
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||||
|
Error: t.ErrMsg,
|
||||||
|
LogFile: filepath.Base(t.LogPath),
|
||||||
|
Charts: charts,
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func reportDoneTime(t *Task) time.Time {
|
||||||
|
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
return *t.DoneAt
|
||||||
|
}
|
||||||
|
return time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||||
|
if t == nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
return now, now
|
||||||
|
}
|
||||||
|
start := t.CreatedAt.UTC()
|
||||||
|
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||||
|
start = t.StartedAt.UTC()
|
||||||
|
}
|
||||||
|
end := time.Now().UTC()
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = t.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
end = start
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
return db.LoadBetween(start, end)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
var charts []taskReportChart
|
||||||
|
inline := make(map[string]string)
|
||||||
|
for _, spec := range taskDashboardChartSpecs {
|
||||||
|
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||||
|
if !ok || len(svg) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(dir, spec.File)
|
||||||
|
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||||
|
inline[spec.File] = string(svg)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, idx := range taskGPUIndices(samples) {
|
||||||
|
file := fmt.Sprintf("gpu-%d-overview.svg", idx)
|
||||||
|
svg, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil || !ok || len(svg) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(dir, file)
|
||||||
|
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, taskReportChart{Title: gpuDisplayLabel(idx) + " Overview", File: file})
|
||||||
|
inline[file] = string(svg)
|
||||||
|
}
|
||||||
|
return charts, inline
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||||
|
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||||
|
if !ok {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
buf, err := renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
return title, buf, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var out []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if seen[g.GPUIndex] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
out = append(out, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(out)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeJSONFile encodes v as indented JSON and writes it to path with mode
// 0644. Nothing is written when encoding fails; the encoding or write error
// is returned.
func writeJSONFile(path string, v any) error {
	encoded, err := json.MarshalIndent(v, "", " ")
	if err == nil {
		err = os.WriteFile(path, encoded, 0644)
	}
	return err
}
|
||||||
|
|
||||||
|
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||||
|
b.WriteString(`<div class="grid2">`)
|
||||||
|
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||||
|
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||||
|
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||||
|
if strings.TrimSpace(report.Error) != "" {
|
||||||
|
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||||
|
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||||
|
b.WriteString(`</div></div></div>`)
|
||||||
|
|
||||||
|
if len(report.Charts) > 0 {
|
||||||
|
b.WriteString(`<div class="grid2">`)
|
||||||
|
for _, chart := range report.Charts {
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||||
|
b.WriteString(charts[chart.File])
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
} else {
|
||||||
|
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||||
|
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskStatusBadge(status string) string {
|
||||||
|
className := map[string]string{
|
||||||
|
TaskRunning: "badge-ok",
|
||||||
|
TaskPending: "badge-unknown",
|
||||||
|
TaskDone: "badge-ok",
|
||||||
|
TaskFailed: "badge-err",
|
||||||
|
TaskCancelled: "badge-unknown",
|
||||||
|
}[status]
|
||||||
|
if className == "" {
|
||||||
|
className = "badge-unknown"
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(status)
|
||||||
|
if label == "" {
|
||||||
|
label = "unknown"
|
||||||
|
}
|
||||||
|
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatTaskTime formats ts in local time as "2006-01-02 15:04:05". When ts
// is nil or zero it formats fallback instead, and when fallback is zero too
// it returns "n/a".
func formatTaskTime(ts *time.Time, fallback time.Time) string {
	const layout = "2006-01-02 15:04:05"
	switch {
	case ts != nil && !ts.IsZero():
		return ts.Local().Format(layout)
	case !fallback.IsZero():
		return fallback.Local().Format(layout)
	default:
		return "n/a"
	}
}
|
||||||
|
|
||||||
|
// formatTaskDuration renders a number of seconds as "Ns", "Nm SSs" or
// "Nh MMm SSs", using the largest unit that applies. Zero and negative values
// yield "n/a".
func formatTaskDuration(sec int) string {
	if sec <= 0 {
		return "n/a"
	}
	hours, minutes, seconds := sec/3600, (sec%3600)/60, sec%60
	switch {
	case hours > 0:
		return fmt.Sprintf("%dh %02dm %02ds", hours, minutes, seconds)
	case minutes > 0:
		return fmt.Sprintf("%dm %02ds", minutes, seconds)
	default:
		return fmt.Sprintf("%ds", seconds)
	}
}
|
||||||
1197
audit/internal/webui/tasks.go
Normal file
1197
audit/internal/webui/tasks.go
Normal file
File diff suppressed because it is too large
Load Diff
595
audit/internal/webui/tasks_test.go
Normal file
595
audit/internal/webui/tasks_test.go
Normal file
@@ -0,0 +1,595 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestTaskQueuePersistsAndRecoversPendingTasks persists a queue holding one
// pending and one running task, loads the state into a fresh queue and checks
// that the pending task stays pending with its params intact while the
// running task is marked failed with an error message and a completion time.
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
	dir := t.TempDir()
	q := &taskQueue{
		statePath: filepath.Join(dir, "tasks-state.json"),
		logsDir:   filepath.Join(dir, "tasks"),
		trigger:   make(chan struct{}, 1),
	}
	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
		t.Fatal(err)
	}

	started := time.Now().Add(-time.Minute)

	// A task that was pending (not yet started) must be re-queued on restart.
	pendingTask := &Task{
		ID:        "task-pending",
		Name:      "Memory Burn-in",
		Target:    "memory-stress",
		Priority:  2,
		Status:    TaskPending,
		CreatedAt: time.Now().Add(-2 * time.Minute),
		params:    taskParams{Duration: 300, BurnProfile: "smoke"},
	}
	// A task that was running when bee-web crashed must NOT be re-queued —
	// its child processes (e.g. gpu-burn-worker) survive the restart in
	// their own process groups and can't be cancelled retroactively.
	runningTask := &Task{
		ID:        "task-running",
		Name:      "NVIDIA GPU Stress",
		Target:    "nvidia-stress",
		Priority:  1,
		Status:    TaskRunning,
		CreatedAt: time.Now().Add(-3 * time.Minute),
		StartedAt: &started,
		params:    taskParams{Duration: 86400},
	}
	for _, task := range []*Task{pendingTask, runningTask} {
		q.tasks = append(q.tasks, task)
		q.assignTaskLogPathLocked(task)
	}
	q.persistLocked()

	// A second queue sharing the same state file and logs directory stands in
	// for the process after a restart.
	recovered := &taskQueue{
		statePath: q.statePath,
		logsDir:   q.logsDir,
		trigger:   make(chan struct{}, 1),
	}
	recovered.loadLocked()

	if len(recovered.tasks) != 2 {
		t.Fatalf("tasks=%d want 2", len(recovered.tasks))
	}

	// Index the recovered tasks by ID so the checks do not depend on order.
	byID := map[string]*Task{}
	for i := range recovered.tasks {
		byID[recovered.tasks[i].ID] = recovered.tasks[i]
	}

	// Pending task must be re-queued as pending with params intact.
	p := byID["task-pending"]
	if p == nil {
		t.Fatal("task-pending not found")
	}
	if p.Status != TaskPending {
		t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
	}
	if p.StartedAt != nil {
		t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
	}
	if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
		t.Fatalf("pending task: params=%+v", p.params)
	}
	if p.LogPath == "" {
		t.Fatal("pending task: expected log path")
	}

	// Running task must be marked failed, not re-queued, to prevent
	// launching duplicate workers (e.g. a second set of gpu-burn-workers).
	r := byID["task-running"]
	if r == nil {
		t.Fatal("task-running not found")
	}
	if r.Status != TaskFailed {
		t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
	}
	if r.ErrMsg == "" {
		t.Fatal("running task: expected non-empty error message")
	}
	if r.DoneAt == nil {
		t.Fatal("running task: expected done_at to be set")
	}
}
|
||||||
|
|
||||||
|
func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "task.log")
|
||||||
|
if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
j := newTaskJobState(path)
|
||||||
|
existing, ch := j.subscribe()
|
||||||
|
if ch == nil {
|
||||||
|
t.Fatal("expected live subscription channel")
|
||||||
|
}
|
||||||
|
if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
|
||||||
|
t.Fatalf("existing=%v", existing)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||||
|
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||||
|
q := &taskQueue{
|
||||||
|
tasks: []*Task{
|
||||||
|
{
|
||||||
|
ID: "old-running",
|
||||||
|
Name: "Old Running",
|
||||||
|
Status: TaskRunning,
|
||||||
|
Priority: 10,
|
||||||
|
CreatedAt: now.Add(-3 * time.Minute),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "new-done",
|
||||||
|
Name: "New Done",
|
||||||
|
Status: TaskDone,
|
||||||
|
Priority: 0,
|
||||||
|
CreatedAt: now.Add(-1 * time.Minute),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
ID: "mid-pending",
|
||||||
|
Name: "Mid Pending",
|
||||||
|
Status: TaskPending,
|
||||||
|
Priority: 1,
|
||||||
|
CreatedAt: now.Add(-2 * time.Minute),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := q.snapshot()
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||||
|
}
|
||||||
|
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||||
|
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
logPath := filepath.Join(dir, "task.log")
|
||||||
|
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "done-1",
|
||||||
|
Name: "Done Task",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
LogPath: logPath,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||||
|
req.SetPathValue("id", "done-1")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h := &handler{}
|
||||||
|
h.handleAPITasksStream(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||||
|
t.Fatalf("body=%q", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "event: done\n") {
|
||||||
|
t.Fatalf("missing done event: %q", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHandleAPITasksStreamPendingTaskStartsSSEImmediately checks that the
// stream endpoint, for a task that is still pending, emits the "Task is
// queued. Waiting for worker..." message within two seconds instead of
// staying silent until the task starts.
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
	// Swap the global queue's tasks for a single pending task; the previous
	// list is restored in the cleanup below.
	globalQueue.mu.Lock()
	origTasks := globalQueue.tasks
	globalQueue.tasks = []*Task{{
		ID:        "pending-1",
		Name:      "Pending Task",
		Status:    TaskPending,
		CreatedAt: time.Now(),
	}}
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = origTasks
		globalQueue.mu.Unlock()
	})

	// The request context is the only way this test stops the handler.
	ctx, cancel := context.WithCancel(context.Background())
	req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
	req.SetPathValue("id", "pending-1")
	rec := httptest.NewRecorder()

	// Run the handler in the background; done is closed once it returns.
	done := make(chan struct{})
	go func() {
		h := &handler{}
		h.handleAPITasksStream(rec, req)
		close(done)
	}()

	// Poll the recorded body until the queued message shows up or the
	// deadline passes.
	// NOTE(review): rec.Body is read here while the handler goroutine may
	// still be writing to it, and nothing in this test synchronises the two —
	// confirm this passes under `go test -race`.
	deadline := time.Now().Add(2 * time.Second)
	for time.Now().Before(deadline) {
		if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
			// Stop the handler and wait for it before inspecting the status code.
			cancel()
			<-done
			if rec.Code != http.StatusOK {
				t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
			}
			return
		}
		time.Sleep(20 * time.Millisecond)
	}
	// Timed out: stop the handler first so the body is final when reported.
	cancel()
	<-done
	t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
}
|
||||||
|
|
||||||
|
// TestFinalizeTaskRunCreatesReportFolderAndArtifacts finalizes a running task
// against a temporary metrics database holding one sample and checks that the
// task ends as done, that its artifacts directory name contains "_done", that
// the report JSON and HTML files exist, and that the JSON report carries the
// task ID, the done status and at least one chart.
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
	dir := t.TempDir()
	metricsPath := filepath.Join(dir, "metrics.db")
	// Point report generation at the temporary database for this test only.
	prevMetricsPath := taskReportMetricsDBPath
	taskReportMetricsDBPath = metricsPath
	t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })

	db, err := openMetricsDB(metricsPath)
	if err != nil {
		t.Fatalf("openMetricsDB: %v", err)
	}
	// One sample 45 seconds in the past, i.e. after the task start set below.
	base := time.Now().UTC().Add(-45 * time.Second)
	if err := db.Write(platform.LiveMetricSample{
		Timestamp:  base,
		CPULoadPct: 42,
		MemLoadPct: 35,
		PowerW:     510,
	}); err != nil {
		t.Fatalf("Write: %v", err)
	}
	_ = db.Close()

	q := &taskQueue{
		statePath: filepath.Join(dir, "tasks-state.json"),
		logsDir:   filepath.Join(dir, "tasks"),
		trigger:   make(chan struct{}, 1),
	}
	if err := os.MkdirAll(q.logsDir, 0755); err != nil {
		t.Fatal(err)
	}

	started := time.Now().UTC().Add(-90 * time.Second)
	task := &Task{
		ID:        "task-1",
		Name:      "CPU SAT",
		Target:    "cpu",
		Status:    TaskRunning,
		CreatedAt: started.Add(-10 * time.Second),
		StartedAt: &started,
	}
	q.assignTaskLogPathLocked(task)
	appendJobLog(task.LogPath, "line-1")

	// Finish the job with an empty error string, then finalize the task.
	job := newTaskJobState(task.LogPath)
	job.finish("")
	q.finalizeTaskRun(task, job)

	if task.Status != TaskDone {
		t.Fatalf("status=%q want %q", task.Status, TaskDone)
	}
	if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
		t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
	}
	if _, err := os.Stat(task.ReportJSONPath); err != nil {
		t.Fatalf("report json: %v", err)
	}
	if _, err := os.Stat(task.ReportHTMLPath); err != nil {
		t.Fatalf("report html: %v", err)
	}
	// Decode the JSON report and check its contents.
	var report taskReport
	data, err := os.ReadFile(task.ReportJSONPath)
	if err != nil {
		t.Fatalf("ReadFile(report.json): %v", err)
	}
	if err := json.Unmarshal(data, &report); err != nil {
		t.Fatalf("Unmarshal(report.json): %v", err)
	}
	if report.ID != task.ID || report.Status != TaskDone {
		t.Fatalf("report=%+v", report)
	}
	if len(report.Charts) == 0 {
		t.Fatalf("expected charts in report, got none")
	}
}
|
||||||
|
|
||||||
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
profile string
|
||||||
|
want burnPreset
|
||||||
|
}{
|
||||||
|
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
|
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||||
|
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||||
|
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
|
t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
loader string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||||
|
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||||
|
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||||
|
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||||
|
blocked := make(chan struct{})
|
||||||
|
released := make(chan struct{})
|
||||||
|
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
close(blocked)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
close(released)
|
||||||
|
return "", ctx.Err()
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
|
close(released)
|
||||||
|
return "unexpected", nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Duration: 60},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = cancel
|
||||||
|
tk.job = j
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return aRun(nil, ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
q.runTask(tk, j, ctx)
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
<-blocked
|
||||||
|
j.abort()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-released:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("task did not observe cancel")
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("runTask did not return after cancel")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||||
|
var gotDuration int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-burn-1",
|
||||||
|
Name: "CPU Burn-in",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{BurnProfile: "smoke"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||||
|
gotDuration = durationSec
|
||||||
|
return "/tmp/cpu-burn.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDuration != 5*60 {
|
||||||
|
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{ExportDir: dir},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "support-bundle-1",
|
||||||
|
Name: "Support Bundle",
|
||||||
|
Target: "support-bundle",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotExportDir string
|
||||||
|
orig := buildSupportBundle
|
||||||
|
buildSupportBundle = func(exportDir string) (string, error) {
|
||||||
|
gotExportDir = exportDir
|
||||||
|
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||||
|
}
|
||||||
|
defer func() { buildSupportBundle = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotExportDir != dir {
|
||||||
|
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||||
|
t.Fatalf("lines=%v", j.lines)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||||
|
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||||
|
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||||
|
started := time.Time{}
|
||||||
|
task := &Task{
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: created,
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
if got := taskElapsedSec(task, now); got != 0 {
|
||||||
|
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := created.Add(-24 * time.Hour)
|
||||||
|
task.StartedAt = &stale
|
||||||
|
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||||
|
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "install-1",
|
||||||
|
Name: "Install to Disk",
|
||||||
|
Target: "install",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{Device: "/dev/sda"},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
var gotDevice string
|
||||||
|
var gotLogPath string
|
||||||
|
orig := installCommand
|
||||||
|
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||||
|
gotDevice = device
|
||||||
|
gotLogPath = logPath
|
||||||
|
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||||
|
}
|
||||||
|
defer func() { installCommand = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotDevice != "/dev/sda" {
|
||||||
|
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||||
|
}
|
||||||
|
if gotLogPath == "" {
|
||||||
|
t.Fatal("expected install log path")
|
||||||
|
}
|
||||||
|
logs := strings.Join(j.lines, "\n")
|
||||||
|
if !strings.Contains(logs, "Install log: ") {
|
||||||
|
t.Fatalf("missing install log line: %v", j.lines)
|
||||||
|
}
|
||||||
|
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||||
|
t.Fatalf("missing streamed output: %v", j.lines)
|
||||||
|
}
|
||||||
|
if j.err != "" {
|
||||||
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-panic-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
panic("boom")
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.executeTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if tk.Status != TaskFailed {
|
||||||
|
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||||
|
}
|
||||||
|
if tk.DoneAt == nil {
|
||||||
|
t.Fatal("expected done_at to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||||
|
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(j.err, "task panic: boom") {
|
||||||
|
t.Fatalf("job error=%q", j.err)
|
||||||
|
}
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/sh
# Print the build version derived from the nearest `v<digit>...` git tag.
#
# Output:
#   - the `git describe` string with its leading "v" removed, or
#   - "dev" when git describe produces nothing (no matching tag, git failure).
set -eu

# `|| true` keeps `set -e` from aborting when git describe fails; the
# failure then shows up as an empty string.
described="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"

if [ -z "${described}" ]; then
	printf 'dev\n'
else
	# ${var#v} strips one leading "v" and is a no-op when there is none,
	# so a non-"v" description is printed unchanged.
	printf '%s\n' "${described#v}"
fi
|
||||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
67
bible-local/architecture/charting.md
Normal file
67
bible-local/architecture/charting.md
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# Charting architecture
|
||||||
|
|
||||||
|
## Decision: one chart engine for all live metrics
|
||||||
|
|
||||||
|
**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output)
|
||||||
|
**Theme:** `grafana` (dark background, coloured lines)
|
||||||
|
|
||||||
|
All live metrics charts in the web UI are server-side SVG images served by Go
|
||||||
|
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||||
|
There is no client-side canvas or JS chart library.
|
||||||
|
|
||||||
|
## Rule: live charts must be visually uniform
|
||||||
|
|
||||||
|
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||||
|
changes to existing charts must keep the same rendering model and presentation
|
||||||
|
rules unless there is an explicit architectural decision to diverge.
|
||||||
|
|
||||||
|
Default expectations:
|
||||||
|
|
||||||
|
- same server-side SVG pipeline for all live metrics charts
|
||||||
|
- same refresh behaviour and failure handling in the browser
|
||||||
|
- same canvas size class and card layout
|
||||||
|
- same legend placement policy across charts
|
||||||
|
- same axis, title, and summary conventions
|
||||||
|
- no chart-specific visual exceptions added as a quick fix
|
||||||
|
|
||||||
|
Current default for live charts:
|
||||||
|
|
||||||
|
- legend below the plot area when a chart has 8 series or fewer
|
||||||
|
- legend hidden when a chart has more than 8 series
|
||||||
|
- 10 equal Y-axis steps across the chart height
|
||||||
|
- 1400 x 360 SVG canvas with legend
|
||||||
|
- 1400 x 288 SVG canvas without legend
|
||||||
|
- full-width card rendering in a single-column stack
|
||||||
|
|
||||||
|
If one chart needs a different layout or legend behaviour, treat that as a
|
||||||
|
design-level decision affecting the whole chart family, not as a local tweak to
|
||||||
|
just one endpoint.
|
||||||
|
|
||||||
|
### Why go-analyze/charts
|
||||||
|
|
||||||
|
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||||
|
- SVG output — crisp at any display resolution, full-width without pixelation
|
||||||
|
- Grafana theme matches the dark web UI colour scheme
|
||||||
|
- Active fork of the archived wcharczuk/go-chart
|
||||||
|
|
||||||
|
### SAT stress-test charts
|
||||||
|
|
||||||
|
The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate
|
||||||
|
self-contained SVG renderer used **only** for completed SAT run reports
|
||||||
|
(HTML export, burn-in summaries). It is not used for live metrics.
|
||||||
|
|
||||||
|
### Live metrics chart endpoints
|
||||||
|
|
||||||
|
| Path | Content |
|
||||||
|
|------|---------|
|
||||||
|
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||||
|
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||||
|
|
||||||
|
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||||
|
the legend is hidden. The page renders them at `width: 100%` in a
|
||||||
|
single-column layout so they always fill the viewport width.
|
||||||
|
|
||||||
|
### Ring buffers
|
||||||
|
|
||||||
|
Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz).
|
||||||
|
Buffers are per-server or per-GPU and grow dynamically as new GPUs appear.
|
||||||
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
|||||||
|
|
||||||
## Boot sequence (single ISO)
|
## Boot sequence (single ISO)
|
||||||
|
|
||||||
|
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||||
|
|
||||||
`systemd` boot order:
|
`systemd` boot order:
|
||||||
|
|
||||||
```
|
```
|
||||||
@@ -20,11 +22,12 @@ local-fs.target
|
|||||||
│ creates /dev/nvidia* nodes)
|
│ creates /dev/nvidia* nodes)
|
||||||
├── bee-audit.service (runs `bee audit` → /var/log/bee-audit.json,
|
├── bee-audit.service (runs `bee audit` → /var/log/bee-audit.json,
|
||||||
│ never blocks boot on partial collector failures)
|
│ never blocks boot on partial collector failures)
|
||||||
└── bee-web.service (runs `bee web` on :80,
|
├── bee-web.service (runs `bee web` on :80 — full interactive web UI)
|
||||||
reads the latest audit snapshot on each request)
|
└── bee-desktop.service (startx → openbox + chromium http://localhost/)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
|
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||||
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||||
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||||
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||||
@@ -41,18 +44,24 @@ Local-console behavior:
|
|||||||
```text
|
```text
|
||||||
tty1
|
tty1
|
||||||
└── live-config autologin → bee
|
└── live-config autologin → bee
|
||||||
└── /home/bee/.profile
|
└── /home/bee/.profile (prints web UI URLs)
|
||||||
└── exec menu
|
|
||||||
└── /usr/local/bin/bee-tui
|
display :0
|
||||||
└── sudo -n /usr/local/bin/bee tui --runtime livecd
|
└── bee-desktop.service (User=bee)
|
||||||
|
└── startx /usr/local/bin/bee-openbox-session -- :0
|
||||||
|
├── tint2 (taskbar)
|
||||||
|
├── chromium http://localhost/
|
||||||
|
└── openbox (WM)
|
||||||
```
|
```
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- local `tty1` lands in user `bee`, not directly in `root`
|
- local `tty1` lands in user `bee`, not directly in `root`
|
||||||
- `menu` must work without typing `sudo`
|
- `bee-desktop.service` starts X11 + openbox + Chromium automatically after `bee-web.service`
|
||||||
- TUI actions still run as `root` via `sudo -n`
|
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||||
- SSH is independent from the tty1 path
|
- SSH is independent from the desktop path
|
||||||
- serial console support is enabled for VM boot debugging
|
- serial console support is enabled for VM boot debugging
|
||||||
|
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||||
|
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||||
|
|
||||||
## ISO build sequence
|
## ISO build sequence
|
||||||
|
|
||||||
@@ -71,24 +80,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
|||||||
d. build kernel modules against Debian headers
|
d. build kernel modules against Debian headers
|
||||||
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||||
f. cache in `dist/nvidia-<version>-<kver>/`
|
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||||
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
7. `build-cublas.sh`:
|
||||||
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||||
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
|
b. verify packages against repo `Packages.gz`
|
||||||
10. write staged `/etc/bee-release` (versions + git commit)
|
c. extract headers for `bee-gpu-burn` worker build
|
||||||
11. patch staged `motd` with build metadata
|
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||||
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||||
13. sync staged overlay into workdir `config/includes.chroot/`
|
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||||
14. run `lb config && lb build` inside the privileged builder container
|
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||||
|
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||||
|
12. write staged `/etc/bee-release` (versions + git commit)
|
||||||
|
13. patch staged `motd` with build metadata
|
||||||
|
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||||
|
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||||
|
16. run `lb config && lb build` inside the privileged builder container
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Build host notes:
|
||||||
|
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||||
|
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||||
|
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||||
|
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||||
|
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||||
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||||
|
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||||
|
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||||
|
|
||||||
## Post-boot smoke test
|
## Post-boot smoke test
|
||||||
|
|
||||||
@@ -104,7 +128,7 @@ Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks pres
|
|||||||
systemd services running, audit completed with NVIDIA enrichment, LAN reachability.
|
systemd services running, audit completed with NVIDIA enrichment, LAN reachability.
|
||||||
|
|
||||||
Current validation state:
|
Current validation state:
|
||||||
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup
|
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and Web UI startup
|
||||||
- real hardware validation is still required before treating the ISO as release-ready
|
- real hardware validation is still required before treating the ISO as release-ready
|
||||||
|
|
||||||
## Overlay mechanism
|
## Overlay mechanism
|
||||||
@@ -131,43 +155,31 @@ Current validation state:
|
|||||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||||
|
|
||||||
Acceptance flows:
|
Acceptance flows:
|
||||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||||
|
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||||
- `bee sat memory` → `memtester` archive
|
- `bee sat memory` → `memtester` archive
|
||||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||||
|
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||||
|
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||||
|
- Ada / Hopper: add `fp8`
|
||||||
|
- Blackwell+: add `fp4`
|
||||||
|
- PTX fallback is used only when the cuBLASLt userspace libraries are missing or when a narrow datatype is not supported
|
||||||
- Runtime overrides:
|
- Runtime overrides:
|
||||||
- `BEE_GPU_STRESS_SECONDS`
|
|
||||||
- `BEE_GPU_STRESS_SIZE_MB`
|
|
||||||
- `BEE_MEMTESTER_SIZE_MB`
|
- `BEE_MEMTESTER_SIZE_MB`
|
||||||
- `BEE_MEMTESTER_PASSES`
|
- `BEE_MEMTESTER_PASSES`
|
||||||
|
|
||||||
## NVIDIA SAT TUI flow (v1.0.0+)
|
## NVIDIA SAT Web UI flow
|
||||||
|
|
||||||
```
|
```
|
||||||
TUI: Acceptance tests → NVIDIA command pack
|
Web UI: Acceptance Tests page → Run Test button
|
||||||
1. screenNvidiaSATSetup
|
1. POST /api/sat/nvidia/run → returns job_id
|
||||||
a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total`
|
2. GET /api/sat/stream?job_id=... (SSE) — streams stdout/stderr lines live
|
||||||
b. user selects duration preset: 10 min / 1 h / 8 h / 24 h
|
3. After completion — archive written to /appdata/bee/export/bee-sat/
|
||||||
c. user selects GPUs via checkboxes (all selected by default)
|
summary.txt contains overall_status (OK / FAILED) and per-job status values
|
||||||
d. memory size = max(selected GPU memory) — auto-detected, not exposed to user
|
|
||||||
2. Start → screenNvidiaSATRunning
|
|
||||||
a. CUDA_VISIBLE_DEVICES set to selected GPU indices
|
|
||||||
b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently
|
|
||||||
c. nvtop occupies full terminal; SAT result queues in background
|
|
||||||
d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress)
|
|
||||||
3. GPU metrics collection (during bee-gpu-stress)
|
|
||||||
- background goroutine polls `nvidia-smi` every second
|
|
||||||
- per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz
|
|
||||||
- outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt
|
|
||||||
4. After SAT completes
|
|
||||||
- result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt)
|
|
||||||
- chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU,
|
|
||||||
Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO).
|
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||||
- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel.
|
|
||||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||||
- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation).
|
|
||||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||||
|
|||||||
@@ -21,13 +21,14 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||||
- Machine-readable health summary derived from collector verdicts
|
- Machine-readable health summary derived from collector verdicts
|
||||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||||
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
|
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
|
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||||
- Automatic boot audit with operator-facing local console and SSH access
|
- Automatic boot audit with operator-facing local console and SSH access
|
||||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||||
- SSH access (OpenSSH) always available for inspection and debugging
|
- SSH access (OpenSSH) always available for inspection and debugging
|
||||||
- Interactive Go TUI via `bee tui` for network setup, service management, and acceptance tests
|
- Full web UI via `bee web` on port 80: interactive control panel with live metrics, SAT tests, network config, service management, export, and tools
|
||||||
- Read-only web viewer via `bee web`, rendering the latest audit snapshot through the embedded Reanimator Chart
|
- Local operator desktop: openbox + Xorg + Chromium auto-opening `http://localhost/`
|
||||||
- Local `tty1` operator UX: `bee` autologin, `menu` auto-start, privileged actions via `sudo -n`
|
- Local `tty1` operator UX: `bee` autologin, openbox desktop auto-starts with Chromium on `http://localhost/`
|
||||||
|
|
||||||
## Network isolation — CRITICAL
|
## Network isolation — CRITICAL
|
||||||
|
|
||||||
@@ -69,15 +70,18 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
| SSH | OpenSSH server |
|
| SSH | OpenSSH server |
|
||||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||||
|
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||||
|
|
||||||
## Operator UX
|
## Operator UX
|
||||||
|
|
||||||
- On the live ISO, `tty1` autologins as `bee`
|
- On the live ISO, `tty1` autologins as `bee`
|
||||||
- The login profile auto-runs `menu`, which enters the Go TUI
|
- `bee-desktop.service` starts X11 + openbox + Chromium on display `:0`
|
||||||
- The TUI itself executes privileged actions as `root` via `sudo -n`
|
- Chromium opens `http://localhost/` — the full web UI
|
||||||
- SSH remains available independently of the local console path
|
- SSH remains available independently of the local console path
|
||||||
|
- Remote operators can open `http://<ip>/` in any browser on the same LAN
|
||||||
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||||
|
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||||
|
|
||||||
## Runtime split
|
## Runtime split
|
||||||
|
|
||||||
@@ -85,6 +89,7 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Live-ISO-only responsibilities stay in `iso/` integration code
|
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||||
- Live ISO launches the Go CLI with `--runtime livecd`
|
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||||
- Local/manual runs use `--runtime auto` or `--runtime local`
|
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||||
|
- Live ISO targets must have enough RAM for the full compressed live medium plus the runtime working set, because the boot medium is copied into memory at startup
|
||||||
|
|
||||||
## Key paths
|
## Key paths
|
||||||
|
|
||||||
@@ -99,7 +104,10 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||||
| `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy |
|
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||||
| `iso/overlay/home/bee/.profile` | `bee` shell profile for local console startup |
|
| `iso/overlay/home/bee/.profile` | `bee` shell profile (PATH only) |
|
||||||
|
| `iso/overlay/etc/systemd/system/bee-desktop.service` | starts X11 + openbox + chromium |
|
||||||
|
| `iso/overlay/usr/local/bin/bee-desktop` | startx wrapper for bee-desktop.service |
|
||||||
|
| `iso/overlay/usr/local/bin/bee-openbox-session` | xinitrc: tint2 + chromium + openbox |
|
||||||
| `dist/` | Build outputs (gitignored) |
|
| `dist/` | Build outputs (gitignored) |
|
||||||
| `iso/out/` | Downloaded ISO files (gitignored) |
|
| `iso/out/` | Downloaded ISO files (gitignored) |
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
|||||||
- Kernel modules and nvidia-smi come from a single verified source.
|
- Kernel modules and nvidia-smi come from a single verified source.
|
||||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||||
|
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||||
|
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||||
|
|||||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||||
|
|
||||||
|
**Date:** 2026-04-01
|
||||||
|
**Status:** resolved
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||||
|
The commit history shows several distinct attempts:
|
||||||
|
|
||||||
|
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||||
|
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||||
|
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||||
|
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||||
|
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||||
|
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||||
|
|
||||||
|
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- `lb binary_memtest` does run and installs `memtest86+`
|
||||||
|
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||||
|
|
||||||
|
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- the build now completes successfully because memtest is non-blocking by default
|
||||||
|
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||||
|
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||||
|
- but it executes too early for its current target paths:
|
||||||
|
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||||
|
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||||
|
- memtest binaries are also still absent in `binary/boot/`
|
||||||
|
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||||
|
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||||
|
|
||||||
|
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||||
|
|
||||||
|
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
artifact dated 2026-04-01:
|
||||||
|
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||||
|
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||||
|
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||||
|
and `isolinux/live.cfg`
|
||||||
|
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||||
|
shipped ISO
|
||||||
|
- the regression was in the build-time validator/debug path in `build.sh`
|
||||||
|
|
||||||
|
Root cause of the false alarm:
|
||||||
|
|
||||||
|
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||||
|
successfully listed/extracted members"
|
||||||
|
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||||
|
observable output as "memtest content missing"
|
||||||
|
- this made a reader failure look identical to a missing memtest payload
|
||||||
|
- as a result, we re-entered the same memtest investigation loop even though
|
||||||
|
the real ISO was already correct
|
||||||
|
|
||||||
|
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||||
|
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||||
|
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||||
|
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||||
|
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||||
|
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||||
|
`iso_memtest_present` return code of `1` as fatal
|
||||||
|
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||||
|
the recovery design itself was wrong
|
||||||
|
|
||||||
|
## Known Failed Attempts
|
||||||
|
|
||||||
|
These approaches were already tried and should not be repeated blindly:
|
||||||
|
|
||||||
|
1. Built-in live-build memtest only.
|
||||||
|
Reason it failed:
|
||||||
|
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||||
|
|
||||||
|
2. Fixing only the memtest file names for Debian Bookworm.
|
||||||
|
Reason it failed:
|
||||||
|
- correct file names alone do not make the files appear in the final ISO.
|
||||||
|
|
||||||
|
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||||
|
Reason it failed:
|
||||||
|
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||||
|
|
||||||
|
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||||
|
Reason it failed:
|
||||||
|
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||||
|
|
||||||
|
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||||
|
Reason it failed:
|
||||||
|
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||||
|
|
||||||
|
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||||
|
Reason it failed:
|
||||||
|
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||||
|
|
||||||
|
## What This Means
|
||||||
|
|
||||||
|
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||||
|
|
||||||
|
- do not assume the built-in memtest stage is sufficient
|
||||||
|
- do not assume `chroot/boot/` will contain memtest payloads
|
||||||
|
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||||
|
- do not assume the current normal binary hook timing is late enough for final patching
|
||||||
|
|
||||||
|
Any future memtest fix must explicitly identify:
|
||||||
|
|
||||||
|
- where the memtest binaries are reliably available at build time
|
||||||
|
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||||
|
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||||
|
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||||
|
the validator printed a memtest warning
|
||||||
|
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||||
|
context rather than accidentally tripping `set -e`
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||||
|
|
||||||
|
Project rules from now on:
|
||||||
|
|
||||||
|
- Do **not** trust `--memtest memtest86+` by itself.
|
||||||
|
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a GRUB menu entry
|
||||||
|
- an isolinux menu entry
|
||||||
|
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||||
|
- a binary hook copying files into `binary/boot/`
|
||||||
|
- extraction from the cached `memtest86+` `.deb`
|
||||||
|
- another deterministic build-time copy step
|
||||||
|
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||||
|
|
||||||
|
Current implementation direction:
|
||||||
|
|
||||||
|
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||||
|
- do not rely on the current early `binary_hooks` timing for final patching
|
||||||
|
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||||
|
- patches the fully materialized `LB_DIR/binary` tree
|
||||||
|
- injects memtest binaries there
|
||||||
|
- ensures final bootloader entries there
|
||||||
|
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||||
|
- also treat ISO validation tooling as part of the critical path:
|
||||||
|
- install a stable ISO reader in the builder image
|
||||||
|
- fail with an explicit reader error if ISO listing/extraction fails
|
||||||
|
- do not treat reader failure as evidence that memtest is missing
|
||||||
|
- do not call a probe that may return "needs recovery" as a bare command under
|
||||||
|
`set -e`; wrap it in explicit control flow
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||||
|
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||||
|
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||||
|
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||||
|
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||||
|
"missing memtest" warning without a successful ISO read is not evidence.
|
||||||
|
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||||
|
|
||||||
|
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||||
|
|
||||||
|
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||||
|
|
||||||
|
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||||
|
those files may not exist yet. Instead:
|
||||||
|
|
||||||
|
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||||
|
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||||
|
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||||
|
If they do not exist, the hook warns and continues (does not fail).
|
||||||
|
|
||||||
|
Set the `BEE_REQUIRE_MEMTEST=1` environment variable to turn these warnings into hard errors when needed.
|
||||||
|
|
||||||
|
**2. Post-`lb build` recovery step in `build.sh`**
|
||||||
|
|
||||||
|
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||||
|
contains all required memtest artifacts. If not:
|
||||||
|
|
||||||
|
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||||
|
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||||
|
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||||
|
the ISO with the patched tree.
|
||||||
|
|
||||||
|
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||||
|
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||||
|
|
||||||
|
**3. ISO validation hardening**
|
||||||
|
|
||||||
|
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||||
|
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||||
|
handled — it does not abort the build prematurely.
|
||||||
|
|
||||||
|
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||||
|
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||||
|
This prevents the false-negative loop that cost us builds v3.14–v3.19 on 2026-04-01.
|
||||||
|
|
||||||
|
### Why this works when earlier attempts did not
|
||||||
|
|
||||||
|
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||||
|
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||||
|
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||||
|
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||||
|
|
||||||
|
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||||
|
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||||
|
There is no ordering dependency to get wrong.
|
||||||
|
|
||||||
|
### Do not revert
|
||||||
|
|
||||||
|
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||||
|
live-build alone produces all four required artifacts:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- memtest entry in `isolinux/live.cfg`
|
||||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
|||||||
| Date | Decision | Status |
|
| Date | Decision | Status |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||||
|
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||||
|
|||||||
62
bible-local/docs/iso-build-rules.md
Normal file
62
bible-local/docs/iso-build-rules.md
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# ISO Build Rules
|
||||||
|
|
||||||
|
## Verify package names before use
|
||||||
|
|
||||||
|
ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
|
||||||
|
|
||||||
|
**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
|
||||||
|
|
||||||
|
Use one of:
|
||||||
|
- `https://packages.debian.org/bookworm/<package-name>` — existence + description
|
||||||
|
- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
|
||||||
|
- `apt-cache show <package>` inside a Debian bookworm container
|
||||||
|
|
||||||
|
This applies to:
|
||||||
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Memtest rule
|
||||||
|
|
||||||
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||||
|
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||||
|
|
||||||
|
For this project, memtest is accepted only when the produced ISO actually
|
||||||
|
contains all of the following:
|
||||||
|
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- a memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- a memtest entry in `isolinux/live.cfg`
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- Keep explicit post-build memtest validation in `build.sh`.
|
||||||
|
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||||
|
If the reader cannot list or extract from the ISO, that is a validator
|
||||||
|
failure, not proof that memtest is missing.
|
||||||
|
- If built-in integration does not produce the artifacts above, use a
|
||||||
|
deterministic project-owned copy/extract step instead of hoping live-build
|
||||||
|
will "start working".
|
||||||
|
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||||
|
a real ISO.
|
||||||
|
- If you reference memtest files manually, verify the exact package file list
|
||||||
|
first for the target Debian release.
|
||||||
|
|
||||||
|
Known bad loops for this repository:
|
||||||
|
|
||||||
|
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||||
|
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||||
|
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||||
|
the final artifact path.
|
||||||
|
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||||
|
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||||
|
bootloader configs.
|
||||||
|
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||||
|
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||||
|
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||||
|
present yet when the hook executed.
|
||||||
|
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||||
|
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||||
|
regression because the final ISO was correct but the validator produced a
|
||||||
|
false negative.
|
||||||
35
bible-local/docs/validate-vs-burn.md
Normal file
35
bible-local/docs/validate-vs-burn.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Validate vs Burn: Hardware Impact Policy
|
||||||
|
|
||||||
|
## Validate Tests (non-destructive)
|
||||||
|
|
||||||
|
Tests on the **Validate** page are purely diagnostic. They:
|
||||||
|
|
||||||
|
- **Do not write to disks** — no data is written to storage devices; SMART counters (power-on hours, load cycle count, reallocated sectors) are not incremented.
|
||||||
|
- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
|
||||||
|
- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
|
||||||
|
- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
|
||||||
|
|
||||||
|
### What Validate tests actually do
|
||||||
|
|
||||||
|
| Test | What it runs |
|
||||||
|
|---|---|
|
||||||
|
| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
|
||||||
|
| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
|
||||||
|
| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
|
||||||
|
| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
|
||||||
|
| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
|
||||||
|
|
||||||
|
## Burn Tests (hardware wear)
|
||||||
|
|
||||||
|
Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
|
||||||
|
|
||||||
|
- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
|
||||||
|
- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
|
||||||
|
- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
|
||||||
|
- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics will advance.
|
||||||
|
|
||||||
|
### Rule
|
||||||
|
|
||||||
|
> Run **Validate** freely on any server, at any time, before or after deployment.
|
||||||
|
> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
|
||||||
|
> Document when and why Burn tests were run.
|
||||||
Submodule internal/chart updated: 05db6994d4...ac8120c8ab
59
iso/README.md
Normal file
59
iso/README.md
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# ISO Build
|
||||||
|
|
||||||
|
The `bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Docker Desktop or another Docker-compatible container runtime
|
||||||
|
- Privileged containers enabled
|
||||||
|
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||||
|
|
||||||
|
## Build On macOS
|
||||||
|
|
||||||
|
From the repository root:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||||
|
|
||||||
|
- Intel Mac
|
||||||
|
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||||
|
|
||||||
|
You do not need to pass `--platform` manually for normal ISO builds.
|
||||||
|
|
||||||
|
## Useful Options
|
||||||
|
|
||||||
|
Build with explicit SSH keys baked into the ISO:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||||
|
```
|
||||||
|
|
||||||
|
Rebuild the builder image:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --rebuild-image
|
||||||
|
```
|
||||||
|
|
||||||
|
Use a custom cache directory:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||||
|
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||||
|
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before the operator UI (local desktop / web UI) comes up.
|
||||||
|
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||||
|
- Override the container platform only if you know why:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
- The shipped ISO is still `amd64`.
|
||||||
|
- Output ISO artifacts are written under `dist/`.
|
||||||
@@ -17,15 +17,40 @@ RUN apt-get update -qq && apt-get install -y \
|
|||||||
wget \
|
wget \
|
||||||
curl \
|
curl \
|
||||||
tar \
|
tar \
|
||||||
|
libarchive-tools \
|
||||||
xz-utils \
|
xz-utils \
|
||||||
rsync \
|
rsync \
|
||||||
build-essential \
|
build-essential \
|
||||||
gcc \
|
gcc \
|
||||||
make \
|
make \
|
||||||
perl \
|
perl \
|
||||||
|
pkg-config \
|
||||||
|
yasm \
|
||||||
|
libssl-dev \
|
||||||
|
zlib1g-dev \
|
||||||
|
libbz2-dev \
|
||||||
|
libgmp-dev \
|
||||||
|
libpcap-dev \
|
||||||
|
libsqlite3-dev \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
ocl-icd-opencl-dev \
|
||||||
linux-headers-amd64 \
|
linux-headers-amd64 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||||
|
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||||
|
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||||
|
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||||
|
&& rm /tmp/cuda-keyring.gpg \
|
||||||
|
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||||
|
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||||
|
> /etc/apt/sources.list.d/cuda.list \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -y cuda-nvcc-12-8 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
|
||||||
|
|
||||||
RUN arch="$(dpkg --print-architecture)" \
|
RUN arch="$(dpkg --print-architecture)" \
|
||||||
&& case "$arch" in \
|
&& case "$arch" in \
|
||||||
amd64) goarch=amd64 ;; \
|
amd64) goarch=amd64 ;; \
|
||||||
|
|||||||
@@ -4,5 +4,20 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
|||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
|
NVCC_VERSION=12.8
|
||||||
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
|
DCGM_VERSION=4.5.3-1
|
||||||
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
|
ROCM_VERSION=6.3.4
|
||||||
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||||
|
ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
|
||||||
|
ROCBLAS_VERSION=4.3.0.60304-76~22.04
|
||||||
|
ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||||
|
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||||
|
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||||
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
@@ -29,9 +29,10 @@ lb config noauto \
|
|||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest none \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY-BEE" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components console=ttyS0,115200n8 console=ttyS1,115200n8 loglevel=7 systemd.log_target=console systemd.journald.forward_to_console=1 systemd.journald.max_level_console=debug username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user