Compare commits
183 Commits
0c16616cc9
...
v4.6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 | |||
| 0a98ed8ae9 | |||
| 911745e4da | |||
| acfd2010d7 | |||
| e904c13790 | |||
| 24c5c72cee | |||
| 6ff0bcad56 | |||
| 4fef26000c | |||
| a393dcb731 | |||
| 9e55728053 | |||
| 4b8023c1cb | |||
| 4c8417d20a | |||
| 0755374dd2 | |||
| c70ae274fa | |||
| 23ad7ff534 | |||
| de130966f7 | |||
| c6fbfc8306 | |||
| 35ad1c74d9 | |||
| 4a02e74b17 | |||
| cd2853ad99 | |||
| 6caf771d6e | |||
| 14fa87b7d7 | |||
| 600ece911b | |||
| 2d424c63cb | |||
| 50f28d1ee6 | |||
| 3579747ae3 | |||
| 09dc7d2613 | |||
| ec0b7f7ff9 | |||
| e7a7ff54b9 | |||
| b4371e291e | |||
| c22b53a406 | |||
| ff0acc3698 | |||
| d50760e7c6 | |||
| ed4f8be019 | |||
| 883592d029 | |||
| a6dcaf1c7e | |||
| 88727fb590 | |||
| c9f5224c42 | |||
| 7cb5c02a9b | |||
| c1aa3cf491 | |||
| f7eb75c57c | |||
| 004cc4910d | |||
| ed1cceed8c | |||
| 9fe9f061f8 | |||
| 837a1fb981 | |||
| 1f43b4e050 | |||
| 83bbc8a1bc | |||
| 896bdb6ee8 | |||
| 5407c26e25 | |||
| 4fddaba9c5 | |||
| d2f384b6eb | |||
| 25f0f30aaf | |||
| a57b037a91 | |||
| 5644231f9a | |||
| eea98e6d76 | |||
| 967455194c | |||
| 79dabf3efb | |||
| 1336f5b95c | |||
| 31486a31c1 | |||
| aa3fc332ba | |||
| 62c57b87f2 | |||
| f600261546 | |||
| d7ca04bdfb | |||
| 5433652c70 | |||
| b25f014dbd | |||
| d69a46f211 | |||
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f | ||
|
|
1e62f828c6 | ||
|
|
f8c997d272 |
4
PLAN.md
4
PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
|
||||
- `bee tui` can rerun the audit manually
|
||||
- `bee tui` can export the latest audit JSON to removable media
|
||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||
|
||||
### 2.6 — Vendor utilities and optional assets
|
||||
|
||||
20
audit/Makefile
Normal file
20
audit/Makefile
Normal file
@@ -0,0 +1,20 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||
endif
|
||||
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
@@ -1,6 +1,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -11,12 +12,19 @@ import (
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"bee/audit/internal/tui"
|
||||
"bee/audit/internal/webui"
|
||||
)
|
||||
|
||||
var Version = "dev"
|
||||
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
return "dev"
|
||||
}
|
||||
return label
|
||||
}
|
||||
|
||||
func main() {
|
||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||
}
|
||||
@@ -40,8 +48,6 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
case "audit":
|
||||
return runAudit(args[1:], stdout, stderr)
|
||||
case "tui":
|
||||
return runTUI(args[1:], stdout, stderr)
|
||||
case "export":
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
@@ -66,7 +72,6 @@ func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee tui --runtime auto|local|livecd
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
||||
@@ -79,8 +84,6 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
switch args[0] {
|
||||
case "audit":
|
||||
return runAudit([]string{"--help"}, stdout, stdout)
|
||||
case "tui":
|
||||
return runTUI([]string{"--help"}, stdout, stdout)
|
||||
case "export":
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
@@ -145,43 +148,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runTUI(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("tui", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee tui [--runtime auto|local|livecd]")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
||||
if err != nil {
|
||||
slog.Error("resolve runtime", "err", err)
|
||||
return 1
|
||||
}
|
||||
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
|
||||
application := app.New(platform.New())
|
||||
if err := tui.Run(application, runtimeInfo.Mode); err != nil {
|
||||
slog.Error("run tui", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -333,10 +299,19 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
}
|
||||
|
||||
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for web", "err", err)
|
||||
}
|
||||
|
||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||
Title: *title,
|
||||
AuditPath: *auditPath,
|
||||
ExportDir: *exportDir,
|
||||
Title: *title,
|
||||
BuildLabel: buildLabel(),
|
||||
AuditPath: *auditPath,
|
||||
ExportDir: *exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}); err != nil {
|
||||
slog.Error("run web", "err", err)
|
||||
return 1
|
||||
@@ -357,6 +332,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
@@ -371,7 +347,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
target := args[0]
|
||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||
return 2
|
||||
}
|
||||
|
||||
@@ -380,19 +356,25 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
switch target {
|
||||
case "nvidia":
|
||||
archive, err = application.RunNvidiaAcceptancePack("")
|
||||
level := *diagLevel
|
||||
if level > 0 {
|
||||
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||
} else {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePack("")
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePack("")
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = application.RunCPUAcceptancePack("", dur)
|
||||
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||
}
|
||||
if err != nil {
|
||||
slog.Error("run sat", "target", target, "err", err)
|
||||
|
||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
old := Version
|
||||
Version = "test-version"
|
||||
t.Cleanup(func() { Version = old })
|
||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||
old := Version
|
||||
Version = "1.2.3"
|
||||
t.Cleanup(func() { Version = old })
|
||||
|
||||
if got := buildLabel(); got != "1.2.3" {
|
||||
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunExportRequiresTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
36
audit/go.mod
36
audit/go.mod
@@ -1,28 +1,26 @@
|
||||
module bee/audit
|
||||
|
||||
go 1.24.0
|
||||
go 1.25.0
|
||||
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require github.com/charmbracelet/bubbletea v1.3.4
|
||||
require github.com/charmbracelet/lipgloss v1.0.0
|
||||
require reanimator/chart v0.0.0
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/charmbracelet/lipgloss v1.0.0 // promoted to direct — used for TUI colors
|
||||
github.com/charmbracelet/x/ansi v0.8.0 // indirect
|
||||
github.com/charmbracelet/x/term v0.2.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.16 // indirect
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.15.2 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
golang.org/x/sync v0.11.0 // indirect
|
||||
golang.org/x/sys v0.30.0 // indirect
|
||||
golang.org/x/text v0.3.8 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
68
audit/go.sum
68
audit/go.sum
@@ -1,37 +1,37 @@
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
||||
github.com/charmbracelet/bubbletea v1.3.4 h1:kCg7B+jSCFPLYRA52SDZjr51kG/fMUEoPoZrkaDHyoI=
|
||||
github.com/charmbracelet/bubbletea v1.3.4/go.mod h1:dtcUCyCGEX3g9tosuYiut3MXgY/Jsv9nKVdibKKRRXo=
|
||||
github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
|
||||
github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
|
||||
github.com/charmbracelet/x/ansi v0.8.0 h1:9GTq3xq9caJW8ZrBTe0LIe2fvfLR/bYXKTx2llXn7xE=
|
||||
github.com/charmbracelet/x/ansi v0.8.0/go.mod h1:wdYl/ONOLHLIVmQaxbIYEC/cRKOQyjTkowiI4blgS9Q=
|
||||
github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
|
||||
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
|
||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
|
||||
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
||||
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
|
||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
|
||||
@@ -33,12 +33,15 @@ var (
|
||||
)
|
||||
|
||||
type App struct {
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -52,10 +55,15 @@ type networkManager interface {
|
||||
DHCPOne(iface string) (string, error)
|
||||
DHCPAll() (string, error)
|
||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||
SetInterfaceState(iface string, up bool) error
|
||||
GetInterfaceState(iface string) (bool, error)
|
||||
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||
}
|
||||
|
||||
type serviceManager interface {
|
||||
ListBeeServices() ([]string, error)
|
||||
ServiceState(name string) string
|
||||
ServiceStatus(name string) (string, error)
|
||||
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
||||
}
|
||||
@@ -70,16 +78,58 @@ type toolManager interface {
|
||||
CheckTools(names []string) []platform.ToolStatus
|
||||
}
|
||||
|
||||
type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
type GPUPresenceResult struct {
|
||||
Nvidia bool
|
||||
AMD bool
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||
vendor := a.sat.DetectGPUVendor()
|
||||
return GPUPresenceResult{
|
||||
Nvidia: vendor == "nvidia",
|
||||
AMD: vendor == "amd",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
RunCPUAcceptancePack(baseDir string, durationSec int) (string, error)
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(baseDir string) (string, error)
|
||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -88,14 +138,39 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
runtime: platform,
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
@@ -105,6 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -173,11 +249,20 @@ func (a *App) RuntimeHealthResult() ActionResult {
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||
}
|
||||
driverLabel := "Driver ready"
|
||||
accelLabel := "CUDA ready"
|
||||
switch a.sat.DetectGPUVendor() {
|
||||
case "amd":
|
||||
driverLabel = "AMDGPU ready"
|
||||
accelLabel = "ROCm SMI ready"
|
||||
case "nvidia":
|
||||
driverLabel = "NVIDIA ready"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||
fmt.Fprintf(&body, "Driver ready: %t\n", health.DriverReady)
|
||||
fmt.Fprintf(&body, "CUDA ready: %t\n", health.CUDAReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||
if len(health.Issues) > 0 {
|
||||
body.WriteString("\n\nIssues:\n")
|
||||
@@ -211,6 +296,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -220,8 +308,11 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit exported."
|
||||
if path != "" {
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
@@ -238,9 +329,12 @@ func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, erro
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle exported."
|
||||
if path != "" {
|
||||
body = "Support bundle exported to " + path
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
@@ -275,6 +369,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
@@ -331,6 +441,10 @@ func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
@@ -386,15 +500,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir)
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
@@ -406,58 +520,74 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (ActionResult, error) {
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, durationSec, sizeMB, gpuIndices)
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
// Include terminal chart if available (runDir = archive path without .tar.gz).
|
||||
if path != "" {
|
||||
termPath := filepath.Join(strings.TrimSuffix(path, ".tar.gz"), "gpu-metrics-term.txt")
|
||||
if chart, readErr := os.ReadFile(termPath); readErr == nil && len(chart) > 0 {
|
||||
body += "\n\n" + string(chart)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir)
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(baseDir, durationSec)
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec)
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir)
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
@@ -469,18 +599,140 @@ func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(baseDir)
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir)
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
@@ -504,6 +756,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -538,6 +791,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
@@ -922,3 +1176,70 @@ func firstNonEmpty(values ...string) string {
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func cleanSummaryKey(key string) string {
|
||||
idx := strings.Index(key, "-")
|
||||
if idx <= 0 {
|
||||
return key
|
||||
}
|
||||
prefix := key[:idx]
|
||||
for _, c := range prefix {
|
||||
if c < '0' || c > '9' {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return key[idx+1:]
|
||||
}
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
@@ -40,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
||||
return f.setStaticIPv4Fn(cfg)
|
||||
}
|
||||
|
||||
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return platform.NetworkSnapshot{}, nil
|
||||
}
|
||||
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||
|
||||
type fakeServices struct {
|
||||
serviceStatusFn func(string) (string, error)
|
||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||
@@ -49,6 +59,10 @@ func (f fakeServices) ListBeeServices() ([]string, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceState(name string) string {
|
||||
return "active"
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
||||
return f.serviceStatusFn(name)
|
||||
}
|
||||
@@ -57,13 +71,22 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
||||
return f.serviceDoFn(name, action)
|
||||
}
|
||||
|
||||
type fakeExports struct{}
|
||||
type fakeExports struct {
|
||||
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
if f.listTargetsFn != nil {
|
||||
return f.listTargetsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||
if f.exportToTargetFn != nil {
|
||||
return f.exportToTargetFn(src, target)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
@@ -97,44 +120,104 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ int, _ []int) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||
if f.runCPUFn != nil {
|
||||
return f.runCPUFn(baseDir, durationSec)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) DetectGPUVendor() string { return "" }
|
||||
func (f fakeSAT) DetectGPUVendor() string {
|
||||
if f.detectVendorFn != nil {
|
||||
return f.detectVendorFn()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { return nil, nil }
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
if f.listAMDGPUsFn != nil {
|
||||
return f.listAMDGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(baseDir string) (string, error) { return "", nil }
|
||||
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -394,6 +477,79 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||
if filepath.Base(src) == "" {
|
||||
t.Fatalf("expected non-empty source path")
|
||||
}
|
||||
return "/media/bee/" + filepath.Base(src), nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||
}
|
||||
if result.Title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||
}
|
||||
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||
if err == nil {
|
||||
t.Fatal("expected export error")
|
||||
}
|
||||
if contains(result.Body, "exported to") {
|
||||
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||
}
|
||||
if result.Body != "Support bundle export failed." {
|
||||
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -455,13 +611,13 @@ func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := a.RunNvidiaAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunMemoryAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunStorageAcceptancePack(""); err != nil {
|
||||
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
@@ -504,18 +660,58 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
archive, err := BuildSupportBundle(exportDir)
|
||||
if err != nil {
|
||||
@@ -524,6 +720,78 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if _, err := os.Stat(archive); err != nil {
|
||||
t.Fatalf("archive stat: %v", err)
|
||||
}
|
||||
|
||||
file, err := os.Open(archive)
|
||||
if err != nil {
|
||||
t.Fatalf("open archive: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gzr, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
t.Fatalf("gzip reader: %v", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
"/system/mstflint-query.txt",
|
||||
} {
|
||||
var found bool
|
||||
for _, name := range names {
|
||||
if contains(name, want) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
foundRaw = true
|
||||
}
|
||||
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||
}
|
||||
}
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -537,6 +805,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -552,12 +824,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
@@ -600,6 +872,44 @@ func TestMainBanner(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestRuntimeHealthResultUsesAMDLabels verifies that RuntimeHealthResult
// renders AMD-specific labels ("AMDGPU ready", "ROCm SMI ready") and omits
// the CUDA label when the SAT vendor detector reports "amd", even though the
// persisted runtime-health JSON uses the vendor-neutral DriverReady/CUDAReady
// fields.
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
	tmp := t.TempDir()
	// Redirect the package-level runtime JSON path into the temp dir and
	// restore it when the test finishes so other tests are unaffected.
	oldRuntimePath := DefaultRuntimeJSONPath
	DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
	t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })

	raw, err := json.Marshal(schema.RuntimeHealth{
		Status:        "OK",
		ExportDir:     "/appdata/bee/export",
		DriverReady:   true,
		CUDAReady:     true,
		NetworkStatus: "OK",
	})
	if err != nil {
		t.Fatalf("marshal runtime health: %v", err)
	}
	if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
		t.Fatalf("write runtime health: %v", err)
	}

	// fakeSAT forces vendor detection to "amd"; the App reads the JSON above.
	a := &App{
		sat: fakeSAT{
			detectVendorFn: func() string { return "amd" },
		},
	}

	result := a.RuntimeHealthResult()
	if !contains(result.Body, "AMDGPU ready: true") {
		t.Fatalf("body missing AMD driver label:\n%s", result.Body)
	}
	if !contains(result.Body, "ROCm SMI ready: true") {
		t.Fatalf("body missing ROCm label:\n%s", result.Body)
	}
	// CUDA wording must not leak through on an AMD system.
	if contains(result.Body, "CUDA ready") {
		t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
	}
}
|
||||
|
||||
// intPtr returns a pointer to v, for populating optional (*int) schema
// fields in test fixtures.
func intPtr(v int) *int {
	p := v
	return &p
}
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
|
||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
// the component stays at the highest observed severity until explicitly reset.
//
// The zero value is not usable; construct with OpenComponentStatusDB.
// All methods are safe for concurrent use.
type ComponentStatusDB struct {
	path    string                            // JSON file backing the store
	mu      sync.Mutex                        // guards records and file writes
	records map[string]*ComponentStatusRecord // keyed by ComponentStatusRecord.ComponentKey
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
type ComponentStatusRecord struct {
	ComponentKey  string                 `json:"component_key"`            // identity string, e.g. "storage:nvme0n1"
	Status        string                 `json:"status"`                   // "OK", "Warning", "Critical", "Unknown"
	LastCheckedAt time.Time              `json:"last_checked_at"`          // when any observation was last recorded
	LastChangedAt time.Time              `json:"last_changed_at"`          // when Status last changed (or was first set)
	ErrorSummary  string                 `json:"error_summary,omitempty"`  // detail from the observation that set Status
	History       []ComponentStatusEntry `json:"history"`                  // append-only log of every observation
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
type ComponentStatusEntry struct {
	At     time.Time `json:"at"`               // observation timestamp (UTC)
	Status string    `json:"status"`           // status as reported by the source
	Source string    `json:"source"`           // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
	Detail string    `json:"detail,omitempty"` // free-form detail from the source
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
// status is "OK", "Warning", "Critical", or "Unknown".
// OK never downgrades an existing Warning or Critical status.
// Every call appends to the component's history and persists the whole DB.
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
	// Tolerate a nil receiver and blank keys so callers can record
	// unconditionally without guarding.
	if db == nil || strings.TrimSpace(key) == "" {
		return
	}
	db.mu.Lock()
	defer db.mu.Unlock()

	now := time.Now().UTC()
	rec, exists := db.records[key]
	if !exists {
		rec = &ComponentStatusRecord{ComponentKey: key}
		db.records[key] = rec
	}
	rec.LastCheckedAt = now

	// History is append-only and unbounded — every observation is kept.
	entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
	rec.History = append(rec.History, entry)

	// Status merge: OK never downgrades Warning/Critical.
	newSev := componentSeverity(status)
	curSev := componentSeverity(rec.Status)
	if newSev > curSev {
		rec.Status = status
		rec.LastChangedAt = now
		rec.ErrorSummary = detail
	} else if rec.Status == "" {
		// First ever observation, even if unrecognized/Unknown (severity 0),
		// still initializes the record's status.
		rec.Status = status
		rec.LastChangedAt = now
	}

	// Persistence is best-effort: a disk write failure must not stop the
	// in-memory record from being updated.
	_ = db.saveLocked()
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity maps a status string to a numeric rank so that
// higher-severity observations win during merges. Unrecognized and empty
// statuses (including "Unknown") rank lowest.
func componentSeverity(status string) int {
	s := strings.TrimSpace(status)
	if s == "Critical" {
		return 3
	}
	if s == "Warning" {
		return 2
	}
	if s == "OK" {
		return 1
	}
	return 0
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
// and writes component status records to db for the given SAT target.
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
// Unknown targets, unreadable summaries, and summaries with no overall_status
// are silently ignored — this is a best-effort bookkeeping step.
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
	if db == nil || strings.TrimSpace(archivePath) == "" {
		return
	}
	archivePath = extractArchivePath(archivePath)
	if archivePath == "" {
		return
	}
	// By convention the run directory is the archive path minus ".tar.gz".
	runDir := strings.TrimSuffix(archivePath, ".tar.gz")
	data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
	if err != nil {
		return
	}
	kv := parseSATKV(string(data))
	overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
	if overall == "" {
		return
	}

	source := "sat:" + target
	dbStatus := satStatusToDBStatus(overall)

	// Map SAT target to component keys.
	switch target {
	case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
		db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
	case "memory", "memory-stress", "sat-stress":
		db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
	case "cpu", "platform-stress":
		db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
	case "storage":
		// Try to record per-device if available in summary.
		// Per-device keys look like "<device>_<step>_status".
		recordedAny := false
		for key, val := range kv {
			if !strings.HasSuffix(key, "_status") || key == "overall_status" {
				continue
			}
			base := strings.TrimSuffix(key, "_status")
			// Device name is everything before the first underscore.
			idx := strings.Index(base, "_")
			if idx <= 0 {
				continue
			}
			devName := base[:idx]
			devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
			db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
			recordedAny = true
		}
		if !recordedAny {
			// No per-device keys — fall back to one system-wide record.
			db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
		}
	}
}
|
||||
|
||||
// satStatusToDBStatus maps a SAT overall_status value to a ComponentStatusDB
// status. A FAILED run is recorded as Warning; PARTIAL, UNSUPPORTED, and any
// unrecognized value make no health claim and map to Unknown.
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	// PARTIAL, UNSUPPORTED, and anything else.
	return "Unknown"
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
// "Archive written to /path/foo.tar.gz" or already a bare path.
// It is the exported entry point for extractArchivePath so callers outside
// this package can reuse the same parsing.
func ExtractArchivePath(s string) string {
	return extractArchivePath(s)
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
// extractArchivePath returns the bare archive path from s, which may be a
// plain "/path/foo.tar.gz" or a log line such as
// "Archive written to /path/foo.tar.gz" — in that case the final
// whitespace-separated token is the path. Inputs without a ".tar.gz" suffix
// are returned trimmed but otherwise unchanged.
func extractArchivePath(s string) string {
	trimmed := strings.TrimSpace(s)
	if !strings.HasSuffix(trimmed, ".tar.gz") {
		return trimmed
	}
	tokens := strings.Fields(trimmed)
	if n := len(tokens); n > 0 {
		return tokens[n-1]
	}
	return trimmed
}
|
||||
|
||||
// parseSATKV parses "key=value" lines from a SAT summary into a map.
// Lines without '=' are skipped; keys and values are whitespace-trimmed.
// A later duplicate key overwrites an earlier one.
func parseSATKV(raw string) map[string]string {
	out := map[string]string{}
	for _, ln := range strings.Split(raw, "\n") {
		key, val, found := strings.Cut(strings.TrimSpace(ln), "=")
		if !found {
			continue
		}
		out[strings.TrimSpace(key)] = strings.TrimSpace(val)
	}
	return out
}
|
||||
@@ -1,387 +0,0 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// ComponentRow is one line in the hardware panel.
|
||||
type ComponentRow struct {
|
||||
Key string // "CPU", "MEM", "GPU", "DISK", "PSU"
|
||||
Status string // "PASS", "FAIL", "CANCEL", "N/A"
|
||||
Detail string // compact one-liner
|
||||
}
|
||||
|
||||
// HardwarePanelData holds everything the TUI right panel needs.
|
||||
type HardwarePanelData struct {
|
||||
Header []string
|
||||
Rows []ComponentRow
|
||||
}
|
||||
|
||||
// LoadHardwarePanel reads the latest audit JSON and SAT summaries.
|
||||
// Returns empty panel if no audit data exists yet.
|
||||
func (a *App) LoadHardwarePanel() HardwarePanelData {
|
||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
if err != nil {
|
||||
return HardwarePanelData{Header: []string{"No audit data — run audit first."}}
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
||||
return HardwarePanelData{Header: []string{"Audit data unreadable."}}
|
||||
}
|
||||
|
||||
statuses := satStatuses()
|
||||
|
||||
var header []string
|
||||
if sys := formatSystemLine(snap.Hardware.Board); sys != "" {
|
||||
header = append(header, sys)
|
||||
}
|
||||
for _, fw := range snap.Hardware.Firmware {
|
||||
if fw.DeviceName == "BIOS" && fw.Version != "" {
|
||||
header = append(header, "BIOS: "+fw.Version)
|
||||
}
|
||||
if fw.DeviceName == "BMC" && fw.Version != "" {
|
||||
header = append(header, "BMC: "+fw.Version)
|
||||
}
|
||||
}
|
||||
if ip := formatIPLine(a.network.ListInterfaces); ip != "" {
|
||||
header = append(header, ip)
|
||||
}
|
||||
|
||||
var rows []ComponentRow
|
||||
|
||||
if cpu := formatCPULine(snap.Hardware.CPUs); cpu != "" {
|
||||
rows = append(rows, ComponentRow{
|
||||
Key: "CPU",
|
||||
Status: statuses["cpu"],
|
||||
Detail: strings.TrimPrefix(cpu, "CPU: "),
|
||||
})
|
||||
}
|
||||
if mem := formatMemoryLine(snap.Hardware.Memory); mem != "" {
|
||||
rows = append(rows, ComponentRow{
|
||||
Key: "MEM",
|
||||
Status: statuses["memory"],
|
||||
Detail: strings.TrimPrefix(mem, "Memory: "),
|
||||
})
|
||||
}
|
||||
if gpu := formatGPULine(snap.Hardware.PCIeDevices); gpu != "" {
|
||||
rows = append(rows, ComponentRow{
|
||||
Key: "GPU",
|
||||
Status: statuses["gpu"],
|
||||
Detail: strings.TrimPrefix(gpu, "GPU: "),
|
||||
})
|
||||
}
|
||||
if disk := formatStorageLine(snap.Hardware.Storage); disk != "" {
|
||||
rows = append(rows, ComponentRow{
|
||||
Key: "DISK",
|
||||
Status: statuses["storage"],
|
||||
Detail: strings.TrimPrefix(disk, "Storage: "),
|
||||
})
|
||||
}
|
||||
if psu := formatPSULine(snap.Hardware.PowerSupplies); psu != "" {
|
||||
rows = append(rows, ComponentRow{
|
||||
Key: "PSU",
|
||||
Status: "N/A",
|
||||
Detail: psu,
|
||||
})
|
||||
}
|
||||
|
||||
return HardwarePanelData{Header: header, Rows: rows}
|
||||
}
|
||||
|
||||
// ComponentDetailResult returns detail text for a component shown in the panel.
|
||||
func (a *App) ComponentDetailResult(key string) ActionResult {
|
||||
switch key {
|
||||
case "CPU":
|
||||
return a.cpuDetailResult(false)
|
||||
case "MEM":
|
||||
return a.satDetailResult("memory", "memory-", "MEM detail")
|
||||
case "GPU":
|
||||
// Prefer whichever GPU SAT was run most recently.
|
||||
nv, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-nvidia-*/summary.txt"))
|
||||
am, _ := filepath.Glob(filepath.Join(DefaultSATBaseDir, "gpu-amd-*/summary.txt"))
|
||||
sort.Strings(nv)
|
||||
sort.Strings(am)
|
||||
latestNV := ""
|
||||
if len(nv) > 0 {
|
||||
latestNV = nv[len(nv)-1]
|
||||
}
|
||||
latestAM := ""
|
||||
if len(am) > 0 {
|
||||
latestAM = am[len(am)-1]
|
||||
}
|
||||
if latestAM > latestNV {
|
||||
return a.satDetailResult("gpu", "gpu-amd-", "GPU detail")
|
||||
}
|
||||
return a.satDetailResult("gpu", "gpu-nvidia-", "GPU detail")
|
||||
case "DISK":
|
||||
return a.satDetailResult("storage", "storage-", "DISK detail")
|
||||
case "PSU":
|
||||
return a.psuDetailResult()
|
||||
default:
|
||||
return ActionResult{Title: key, Body: "No detail available."}
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) cpuDetailResult(satOnly bool) ActionResult {
|
||||
var b strings.Builder
|
||||
|
||||
// Show latest SAT summary if available.
|
||||
satResult := a.satDetailResult("cpu", "cpu-", "CPU SAT")
|
||||
if satResult.Body != "No test results found. Run a test first." {
|
||||
fmt.Fprintln(&b, "=== Last SAT ===")
|
||||
fmt.Fprintln(&b, satResult.Body)
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
|
||||
if satOnly {
|
||||
body := strings.TrimSpace(b.String())
|
||||
if body == "" {
|
||||
body = "No CPU SAT results found. Run a test first."
|
||||
}
|
||||
return ActionResult{Title: "CPU SAT", Body: body}
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
if err != nil {
|
||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
||||
}
|
||||
if len(snap.Hardware.CPUs) == 0 {
|
||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
||||
}
|
||||
fmt.Fprintln(&b, "=== Audit ===")
|
||||
for i, cpu := range snap.Hardware.CPUs {
|
||||
fmt.Fprintf(&b, "CPU %d\n", i)
|
||||
if cpu.Model != nil {
|
||||
fmt.Fprintf(&b, " Model: %s\n", *cpu.Model)
|
||||
}
|
||||
if cpu.Manufacturer != nil {
|
||||
fmt.Fprintf(&b, " Vendor: %s\n", *cpu.Manufacturer)
|
||||
}
|
||||
if cpu.Cores != nil {
|
||||
fmt.Fprintf(&b, " Cores: %d\n", *cpu.Cores)
|
||||
}
|
||||
if cpu.Threads != nil {
|
||||
fmt.Fprintf(&b, " Threads: %d\n", *cpu.Threads)
|
||||
}
|
||||
if cpu.MaxFrequencyMHz != nil {
|
||||
fmt.Fprintf(&b, " Max freq: %d MHz\n", *cpu.MaxFrequencyMHz)
|
||||
}
|
||||
if cpu.TemperatureC != nil {
|
||||
fmt.Fprintf(&b, " Temp: %.1f°C\n", *cpu.TemperatureC)
|
||||
}
|
||||
if cpu.Throttled != nil {
|
||||
fmt.Fprintf(&b, " Throttled: %v\n", *cpu.Throttled)
|
||||
}
|
||||
if cpu.CorrectableErrorCount != nil && *cpu.CorrectableErrorCount > 0 {
|
||||
fmt.Fprintf(&b, " ECC correctable: %d\n", *cpu.CorrectableErrorCount)
|
||||
}
|
||||
if cpu.UncorrectableErrorCount != nil && *cpu.UncorrectableErrorCount > 0 {
|
||||
fmt.Fprintf(&b, " ECC uncorrectable: %d\n", *cpu.UncorrectableErrorCount)
|
||||
}
|
||||
if i < len(snap.Hardware.CPUs)-1 {
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "CPU", Body: strings.TrimSpace(b.String())}
|
||||
}
|
||||
|
||||
func (a *App) satDetailResult(statusKey, prefix, title string) ActionResult {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return ActionResult{Title: title, Body: "No test results found. Run a test first."}
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return ActionResult{Title: title, Body: "Could not read test results."}
|
||||
}
|
||||
return ActionResult{Title: title, Body: formatSATDetail(strings.TrimSpace(string(raw)))}
|
||||
}
|
||||
|
||||
// formatSATDetail converts raw summary.txt key=value content to a human-readable per-step display.
// Output shape: an optional "Run: <timestamp>" header, one "PASS/FAIL/SKIP/? <step>"
// line per step key (in file order), and an "Overall: ..." footer when present.
func formatSATDetail(raw string) string {
	var b strings.Builder
	kv := parseKeyValueSummary(raw)

	if t, ok := kv["run_at_utc"]; ok {
		fmt.Fprintf(&b, "Run: %s\n\n", t)
	}

	// Collect step names in order they appear in the file
	// (the kv map loses ordering, so re-scan the raw lines).
	lines := strings.Split(raw, "\n")
	var stepKeys []string
	seenStep := map[string]bool{}
	for _, line := range lines {
		if idx := strings.Index(line, "_status="); idx >= 0 {
			key := line[:idx]
			// "overall_status" is reported in the footer, not as a step.
			if !seenStep[key] && key != "overall" {
				seenStep[key] = true
				stepKeys = append(stepKeys, key)
			}
		}
	}

	for _, key := range stepKeys {
		status := kv[key+"_status"]
		display := cleanSummaryKey(key)
		switch status {
		case "OK":
			fmt.Fprintf(&b, "PASS %s\n", display)
		case "FAILED":
			fmt.Fprintf(&b, "FAIL %s\n", display)
		case "UNSUPPORTED":
			fmt.Fprintf(&b, "SKIP %s\n", display)
		default:
			// Unrecognized or missing status for this step.
			fmt.Fprintf(&b, "? %s\n", display)
		}
	}

	if overall, ok := kv["overall_status"]; ok {
		ok2 := kv["job_ok"]
		failed := kv["job_failed"]
		fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
	}

	return strings.TrimSpace(b.String())
}
|
||||
|
||||
// cleanSummaryKey strips a leading all-digit ordering prefix from a SAT step
// key: "1-lscpu" → "lscpu", "3-stress-ng" → "stress-ng". Keys without a
// purely numeric prefix (or starting with "-") are returned unchanged.
func cleanSummaryKey(key string) string {
	prefix, rest, found := strings.Cut(key, "-")
	if !found || prefix == "" {
		return key
	}
	for _, r := range prefix {
		if r < '0' || r > '9' {
			return key
		}
	}
	return rest
}
|
||||
|
||||
func (a *App) psuDetailResult() ActionResult {
|
||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
if err != nil {
|
||||
return ActionResult{Title: "PSU", Body: "No audit data."}
|
||||
}
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(raw, &snap); err != nil {
|
||||
return ActionResult{Title: "PSU", Body: "Audit data unreadable."}
|
||||
}
|
||||
if len(snap.Hardware.PowerSupplies) == 0 {
|
||||
return ActionResult{Title: "PSU", Body: "No PSU data in last audit."}
|
||||
}
|
||||
var b strings.Builder
|
||||
for i, psu := range snap.Hardware.PowerSupplies {
|
||||
fmt.Fprintf(&b, "PSU %d\n", i)
|
||||
if psu.Model != nil {
|
||||
fmt.Fprintf(&b, " Model: %s\n", *psu.Model)
|
||||
}
|
||||
if psu.Vendor != nil {
|
||||
fmt.Fprintf(&b, " Vendor: %s\n", *psu.Vendor)
|
||||
}
|
||||
if psu.WattageW != nil {
|
||||
fmt.Fprintf(&b, " Rated: %d W\n", *psu.WattageW)
|
||||
}
|
||||
if psu.InputPowerW != nil {
|
||||
fmt.Fprintf(&b, " Input: %.1f W\n", *psu.InputPowerW)
|
||||
}
|
||||
if psu.OutputPowerW != nil {
|
||||
fmt.Fprintf(&b, " Output: %.1f W\n", *psu.OutputPowerW)
|
||||
}
|
||||
if psu.TemperatureC != nil {
|
||||
fmt.Fprintf(&b, " Temp: %.1f°C\n", *psu.TemperatureC)
|
||||
}
|
||||
if i < len(snap.Hardware.PowerSupplies)-1 {
|
||||
fmt.Fprintln(&b)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "PSU", Body: strings.TrimSpace(b.String())}
|
||||
}
|
||||
|
||||
// satStatuses reads the latest summary.txt for each SAT type and returns
|
||||
// a map of component key ("gpu","memory","storage") → status ("PASS","FAIL","CANCEL","N/A").
|
||||
func satStatuses() map[string]string {
|
||||
result := map[string]string{
|
||||
"gpu": "N/A",
|
||||
"memory": "N/A",
|
||||
"storage": "N/A",
|
||||
"cpu": "N/A",
|
||||
}
|
||||
patterns := []struct {
|
||||
key string
|
||||
prefix string
|
||||
}{
|
||||
{"gpu", "gpu-nvidia-"},
|
||||
{"gpu", "gpu-amd-"},
|
||||
{"memory", "memory-"},
|
||||
{"storage", "storage-"},
|
||||
{"cpu", "cpu-"},
|
||||
}
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
values := parseKeyValueSummary(string(raw))
|
||||
switch strings.ToUpper(strings.TrimSpace(values["overall_status"])) {
|
||||
case "OK":
|
||||
result[item.key] = "PASS"
|
||||
case "FAILED":
|
||||
result[item.key] = "FAIL"
|
||||
case "CANCELED", "CANCELLED":
|
||||
result[item.key] = "CANCEL"
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func formatPSULine(psus []schema.HardwarePowerSupply) string {
|
||||
var present []schema.HardwarePowerSupply
|
||||
for _, psu := range psus {
|
||||
if psu.Present != nil && !*psu.Present {
|
||||
continue
|
||||
}
|
||||
present = append(present, psu)
|
||||
}
|
||||
if len(present) == 0 {
|
||||
return ""
|
||||
}
|
||||
firstW := 0
|
||||
if present[0].WattageW != nil {
|
||||
firstW = *present[0].WattageW
|
||||
}
|
||||
allSame := firstW > 0
|
||||
for _, p := range present[1:] {
|
||||
w := 0
|
||||
if p.WattageW != nil {
|
||||
w = *p.WattageW
|
||||
}
|
||||
if w != firstW {
|
||||
allSame = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allSame && firstW > 0 {
|
||||
return fmt.Sprintf("%dx %dW", len(present), firstW)
|
||||
}
|
||||
return fmt.Sprintf("%d PSU", len(present))
|
||||
}
|
||||
300
audit/internal/app/sat_overlay.go
Normal file
300
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,300 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// applyLatestSATStatuses overlays the most recent SAT results found under
// baseDir onto the hardware snapshot, then overlays the persistent component
// status DB. Each subsystem uses only its latest run (by directory-name sort
// in loadLatestSATSummary). The DB goes last; since merging only ever raises
// severity, the DB can upgrade but never clear a SAT finding.
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
	if snap == nil || strings.TrimSpace(baseDir) == "" {
		return
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
		applyMemorySAT(snap.Memory, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
		applyCPUSAT(snap.CPUs, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
		applyStorageSAT(snap.Storage, summary)
	}
	// Apply unified component status DB — overlaid last so it can only upgrade severity.
	applyComponentStatusDB(snap, db)
}
|
||||
|
||||
// satSummary is the parsed content of one SAT run's summary.txt.
type satSummary struct {
	runAtUTC string            // "run_at_utc" value, trimmed; "" if absent
	overall  string            // "overall_status" value, upper-cased and trimmed
	kv       map[string]string // every key=value pair from the file
}
|
||||
|
||||
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return satSummary{}, false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return satSummary{}, false
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
return satSummary{
|
||||
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||
kv: kv,
|
||||
}, true
|
||||
}
|
||||
|
||||
// applyGPUVendorSAT merges one vendor's GPU SAT outcome into every PCIe
// device that matches that vendor's GPU signature (see matchesGPUVendor).
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
	if !ok {
		// No recognizable overall_status — nothing to assert about health.
		return
	}
	for i := range devs {
		if !matchesGPUVendor(devs[i], vendor) {
			continue
		}
		mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}
|
||||
|
||||
// applyMemorySAT merges the memory SAT outcome into every DIMM; the test is
// system-wide, so all modules receive the same status.
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, "memory SAT")
	if !ok {
		return
	}
	for i := range dimms {
		mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}
|
||||
|
||||
// applyCPUSAT merges the CPU SAT outcome into every CPU socket; the test is
// system-wide, so all sockets receive the same status.
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, "CPU SAT")
	if !ok {
		return
	}
	for i := range cpus {
		mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}
|
||||
|
||||
// applyStorageSAT merges per-device storage SAT outcomes into disks.
// Disks are matched by base device name (e.g. "nvme0n1") taken from the
// "linux_device" telemetry field; disks with no SAT entry are untouched.
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
	byDevice := parseStorageSATStatus(summary)
	for i := range disks {
		// Telemetry is a loosely-typed map; a missing or non-string value
		// yields "" here.
		devPath, _ := disks[i].Telemetry["linux_device"].(string)
		devName := filepath.Base(strings.TrimSpace(devPath))
		// NOTE(review): filepath.Base("") returns ".", so this guard never
		// fires; an empty path is effectively filtered by the map lookup
		// below instead ("." is never a device key).
		if devName == "" {
			continue
		}
		result, ok := byDevice[devName]
		if !ok {
			continue
		}
		mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
	}
}
|
||||
|
||||
// satStatusResult is one device's merged outcome from a storage SAT summary.
type satStatusResult struct {
	status      string // component status from satKeyStatus: "OK", "Critical", or "Unknown"
	description string // failure detail; empty on success or inconclusive runs
	ok          bool   // true once any step produced a usable status
}
|
||||
|
||||
// parseStorageSATStatus extracts per-device statuses from a storage SAT
// summary. Keys look like "<device>_<step>_status" (device name is everything
// before the first underscore); for each device the highest-severity step
// outcome wins.
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
	result := map[string]satStatusResult{}
	for key, value := range summary.kv {
		if !strings.HasSuffix(key, "_status") || key == "overall_status" {
			continue
		}
		base := strings.TrimSuffix(key, "_status")
		// Keys without a device prefix are not per-device entries.
		idx := strings.Index(base, "_")
		if idx <= 0 {
			continue
		}
		devName := base[:idx]
		// Step names use dashes in display form ("smart_health" → "smart-health").
		step := strings.ReplaceAll(base[idx+1:], "_", "-")
		stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
		if !ok {
			continue
		}
		// Keep the worst status observed for this device across all steps.
		current := result[devName]
		if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
			result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
		}
	}
	return result
}
|
||||
|
||||
// satSummaryStatus maps a summary's overall status to a component status,
// using label in any failure description. See satKeyStatus for the mapping.
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
	return satKeyStatus(summary.overall, label)
}
|
||||
|
||||
// satKeyStatus maps a raw SAT status value to a component status plus a
// description. OK stays OK with no description (error descriptions are for
// problems only); FAILED becomes Critical with "<label> failed"; inconclusive
// runs (PARTIAL/UNSUPPORTED/CANCELED) become Unknown — the tool couldn't run,
// so no hardware-health claim is made. Unrecognized values report ok=false.
func satKeyStatus(rawStatus, label string) (string, string, bool) {
	normalized := strings.ToUpper(strings.TrimSpace(rawStatus))
	switch normalized {
	case "OK":
		return "OK", "", true
	case "FAILED":
		return "Critical", label + " failed", true
	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
		return "Unknown", "", true
	}
	return "", "", false
}
|
||||
|
||||
// mergeComponentStatus folds one observation into a component's status.
// The new status wins when the component has no status yet, currently reads
// "Unknown", or the new status is strictly more severe — so findings are
// never silently downgraded. On a win the change is recorded in the
// component's status history, but only when a timestamp is available.
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
		component.Status = appStringPtr(satStatus)
		if strings.TrimSpace(description) != "" {
			// An empty new description leaves any prior description intact.
			component.ErrorDescription = appStringPtr(description)
		}
		if strings.TrimSpace(changedAt) != "" {
			component.StatusChangedAt = appStringPtr(changedAt)
			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
				Status:    satStatus,
				ChangedAt: changedAt,
				Details:   appStringPtr(description),
			})
		}
	}
}
|
||||
|
||||
// statusSeverity ranks component statuses so that higher values win a merge.
// "Unknown" deliberately ties with "OK": an inconclusive test must not
// override a positive result from another source. Unrecognized strings rank
// lowest.
func statusSeverity(status string) int {
	s := strings.TrimSpace(status)
	if s == "Critical" {
		return 3
	}
	if s == "Warning" {
		return 2
	}
	if s == "OK" || s == "Unknown" {
		return 1
	}
	return 0
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// applyComponentStatusDB overlays persisted component-status records onto the
// snapshot. The record key selects the target: "pcie:<bdf>" matches PCIe
// devices by normalized BDF, "storage:<dev>" matches disks by linux device
// name ("storage:all" hits every disk), and "memory:*" / "cpu:*" fan out to
// all DIMMs / CPU sockets. Statuses merge via mergeComponentStatus, so the
// DB can only raise a component's severity.
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
	if snap == nil || db == nil {
		return
	}
	for _, rec := range db.All() {
		key := rec.ComponentKey
		status := dbStatusToSATStatus(rec.Status)
		if status == "" {
			// Unrecognized status string — skip rather than guess.
			continue
		}
		detail := rec.ErrorSummary
		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")

		switch {
		case strings.HasPrefix(key, "pcie:"):
			bdf := strings.TrimPrefix(key, "pcie:")
			bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
			// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
			if sanitizeBDFForLookup(bdf) == "" {
				break
			}
			normalized := sanitizeBDFForLookup(bdf)
			for i := range snap.PCIeDevices {
				if snap.PCIeDevices[i].BDF == nil {
					continue
				}
				if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
					mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
				}
			}
		case strings.HasPrefix(key, "storage:"):
			devName := strings.TrimPrefix(key, "storage:")
			if devName == "all" {
				for i := range snap.Storage {
					mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
				}
			} else {
				// Match a single disk by its base linux device name.
				for i := range snap.Storage {
					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
					if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
						mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
					}
				}
			}
		case strings.HasPrefix(key, "memory:"):
			for i := range snap.Memory {
				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
			}
		case strings.HasPrefix(key, "cpu:"):
			for i := range snap.CPUs {
				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
			}
		}
	}
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
// The returned value is the trimmed canonical token; unrecognised inputs map to "".
func dbStatusToSATStatus(s string) string {
	switch trimmed := strings.TrimSpace(s); trimmed {
	case "OK", "Warning", "Critical", "Unknown":
		// Return the trimmed form so downstream comparisons never see
		// stray whitespace from the persisted record.
		return trimmed
	default:
		return ""
	}
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
// Blank input, the bare word "gpu", and strings with embedded whitespace yield "".
func sanitizeBDFForLookup(bdf string) string {
	normalized := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case normalized == "", normalized == "gpu", strings.ContainsAny(normalized, " \t"):
		return ""
	case strings.Count(normalized, ":") == 1:
		// Short form without a PCI domain — assume domain 0000.
		return "0000:" + normalized
	default:
		return normalized
	}
}
|
||||
|
||||
// ptrString dereferences v, treating a nil pointer as the empty string.
func ptrString(v *string) string {
	if v != nil {
		return *v
	}
	return ""
}
|
||||
|
||||
// appStringPtr returns a pointer to a copy of value, for filling optional
// (*string) schema fields.
func appStringPtr(value string) *string {
	v := value
	return &v
}
|
||||
61
audit/internal/app/sat_overlay_test.go
Normal file
61
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// TestApplyLatestSATStatusesMarksStorageByDevice verifies that a SAT storage
// run summary is matched to individual disks by their Linux device name: the
// disk whose per-device key reports OK stays OK even though overall_status
// is FAILED, while the disk that reported FAILED is marked Critical.
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
	baseDir := t.TempDir()
	// The run directory name encodes test type and timestamp, matching the
	// layout applyLatestSATStatuses scans for.
	runDir := filepath.Join(baseDir, "storage-20260325-161151")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
		t.Fatal(err)
	}

	// Two disks distinguished only by their "linux_device" telemetry value.
	nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}

	applyLatestSATStatuses(&snap, baseDir, nil)

	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
	}
	if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
		t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
	}
}
|
||||
|
||||
// TestApplyLatestSATStatusesMarksAMDGPUs verifies that a failed gpu-amd SAT
// run (no per-device keys, only overall_status=FAILED) marks a matching
// AMD display-class PCIe device Critical.
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
	baseDir := t.TempDir()
	runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
		t.Fatal(err)
	}

	// Class/manufacturer strings shaped like lspci output for an AMD GPU,
	// which is what the vendor matching keys on.
	class := "DisplayController"
	manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
	snap := schema.HardwareSnapshot{
		PCIeDevices: []schema.HardwarePCIeDevice{{
			DeviceClass:  &class,
			Manufacturer: &manufacturer,
		}},
	}

	applyLatestSATStatuses(&snap, baseDir, nil)

	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
	}
}
|
||||
@@ -27,15 +27,118 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -i "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -m "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v mstflint >/dev/null 2>&1; then
|
||||
echo "mstflint not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/bus/pci/devices/*; do
|
||||
[ -e "$path/vendor" ] || continue
|
||||
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x15b3" ] || continue
|
||||
bdf=$(basename "$path")
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
mstflint -d "$bdf" q 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no Mellanox/NVIDIA networking devices found"
|
||||
fi
|
||||
`}},
|
||||
}
|
||||
|
||||
// supportBundleOptionalFiles lists host log files copied into the support
// bundle when present; a missing source file is skipped (see the ignored
// copyOptionalFile error at the call site).
var supportBundleOptionalFiles = []struct {
	name string // destination path inside the bundle
	src  string // absolute source path on the host
}{
	{name: "system/kern.log", src: "/var/log/kern.log"},
	{name: "system/syslog.txt", src: "/var/log/syslog"},
}
|
||||
|
||||
// supportBundleGlob matches generated support-bundle archives in a directory.
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
@@ -56,7 +159,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
defer os.RemoveAll(stageRoot)
|
||||
|
||||
if err := copyDirContents(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||
@@ -75,6 +178,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -86,34 +192,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
// LatestSupportBundlePath returns the newest support-bundle archive found in
// the system temp directory, or os.ErrNotExist when none is available.
func LatestSupportBundlePath() (string, error) {
	return latestSupportBundlePath(os.TempDir())
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
type entry struct {
|
||||
path string
|
||||
mod time.Time
|
||||
entries := supportBundleEntries(matches)
|
||||
for path, mod := range entries {
|
||||
if time.Since(mod) > 24*time.Hour {
|
||||
_ = os.Remove(path)
|
||||
delete(entries, path)
|
||||
}
|
||||
}
|
||||
list := make([]entry, 0, len(matches))
|
||||
ordered := orderSupportBundles(entries)
|
||||
if len(ordered) > 3 {
|
||||
for _, old := range ordered[3:] {
|
||||
_ = os.Remove(old)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// latestSupportBundlePath returns the newest archive matching
// supportBundleGlob in dir, or os.ErrNotExist when none survives pruning.
func latestSupportBundlePath(dir string) (string, error) {
	matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
	if err != nil {
		return "", err
	}
	// Entries are ordered newest-first, so the first element is the latest.
	ordered := orderSupportBundles(supportBundleEntries(matches))
	if len(ordered) == 0 {
		return "", os.ErrNotExist
	}
	return ordered[0], nil
}
|
||||
|
||||
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||
entries := make(map[string]time.Time, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||
_ = os.Remove(match)
|
||||
continue
|
||||
}
|
||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||
entries[match] = info.ModTime()
|
||||
}
|
||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||
if len(list) > 3 {
|
||||
for _, old := range list[3:] {
|
||||
_ = os.Remove(old.path)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||
ordered := make([]string, 0, len(entries))
|
||||
for path := range entries {
|
||||
ordered = append(ordered, path)
|
||||
}
|
||||
return nil
|
||||
sort.Slice(ordered, func(i, j int) bool {
|
||||
return entries[ordered[i]].After(entries[ordered[j]])
|
||||
})
|
||||
return ordered
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
@@ -152,6 +288,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
@@ -214,6 +368,58 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// copyExportDirForSupportBundle copies the export directory into the staging
// area while excluding archives that would bloat (or recursively embed) the
// bundle: SAT run tarballs under bee-sat/ and previously built bee-support
// archives. The copied bee-audit.json is then normalised with the SAT overlay.
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
	if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
		// Normalise to slash form so the prefix checks below are OS-independent.
		cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
		if cleanRel == "" {
			return true
		}
		// Exclude SAT run archives.
		if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
			return false
		}
		// Exclude earlier support bundles so bundles never nest each other.
		if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
			return false
		}
		return true
	}); err != nil {
		return err
	}
	return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
}
|
||||
|
||||
// normalizeSupportBundleAuditJSON rewrites the staged bee-audit.json with the
// SAT overlay applied. A missing file is not an error (nothing was staged).
func normalizeSupportBundleAuditJSON(path string) error {
	data, err := os.ReadFile(path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	normalized, err := ApplySATOverlay(data)
	if err != nil {
		// Deliberate best effort: an overlay failure leaves the original
		// audit JSON in place instead of failing the whole bundle build.
		return nil
	}
	return os.WriteFile(path, normalized, 0644)
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyPath(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
@@ -254,6 +460,36 @@ func copyPath(src, dst string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// copyPathFiltered recursively copies src (file or directory) to dst,
// consulting keep with the path relative to rootSrc before copying anything.
// Directories are created with the source's permission bits; non-directory
// entries are delegated to copyPath.
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
	info, err := os.Stat(src)
	if err != nil {
		return err
	}
	rel, err := filepath.Rel(rootSrc, src)
	if err != nil {
		return err
	}
	// A nil keep means "copy everything".
	if keep != nil && !keep(rel, info) {
		return nil
	}
	if info.IsDir() {
		if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
			return err
		}
		entries, err := os.ReadDir(src)
		if err != nil {
			return err
		}
		for _, entry := range entries {
			if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
				return err
			}
		}
		return nil
	}
	return copyPath(src, dst)
}
|
||||
|
||||
func createSupportTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
|
||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// Indirections over exec/filepath so tests can stub rocm-smi discovery and
// invocation (see amdgpu_test.go).
var (
	amdSMIExecCommand = exec.Command
	amdSMILookPath    = exec.LookPath
	amdSMIGlob        = filepath.Glob
)
||||
|
||||
// amdSMIExecutableGlobs are fallback locations probed for rocm-smi when it is
// not on PATH; the versioned /opt/rocm-* pattern covers side-by-side installs.
var amdSMIExecutableGlobs = []string{
	"/opt/rocm/bin/rocm-smi",
	"/opt/rocm-*/bin/rocm-smi",
	"/usr/local/bin/rocm-smi",
}
|
||||
|
||||
// amdGPUInfo aggregates the per-card fields gathered from rocm-smi, keyed by
// the card's normalised PCIe BDF for matching against discovered devices.
type amdGPUInfo struct {
	BDF      string   // normalised PCIe bus/device/function address (--showbus)
	Serial   string   // board serial number (--showserial)
	Product  string   // product name (--showproductname)
	Firmware string   // VBIOS version string (--showvbios)
	PowerW   *float64 // power reading in watts, when parseable (--showpower)
	TempC    *float64 // temperature reading, when parseable (--showtemp)
}
|
||||
|
||||
// enrichPCIeWithAMD fills serial, firmware (VBIOS), model, power and
// temperature on AMD GPU PCIe devices using rocm-smi output. Enrichment is
// skipped entirely when no AMD GPU is present or rocm-smi is unavailable;
// the (possibly mutated) input slice is returned either way.
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	if !hasAMDGPUDevices(devs) {
		return devs
	}
	infoByBDF, err := queryAMDGPUs()
	if err != nil {
		// Non-fatal: hosts without a working rocm-smi simply skip enrichment.
		slog.Info("amdgpu: enrichment skipped", "err", err)
		return devs
	}
	enriched := 0
	for i := range devs {
		if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
			continue
		}
		// Match rocm-smi cards to lspci devices by normalised BDF.
		info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
		if !ok {
			continue
		}
		// info is a fresh per-iteration copy, so taking addresses of its
		// fields below is safe.
		if strings.TrimSpace(info.Serial) != "" {
			devs[i].SerialNumber = &info.Serial
		}
		if strings.TrimSpace(info.Firmware) != "" {
			devs[i].Firmware = &info.Firmware
		}
		// Model is only filled when no model was already discovered.
		if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
			devs[i].Model = &info.Product
		}
		if info.PowerW != nil {
			devs[i].PowerW = info.PowerW
		}
		if info.TempC != nil {
			devs[i].TemperatureC = info.TempC
		}
		enriched++
	}
	if enriched > 0 {
		slog.Info("amdgpu: enriched", "count", enriched)
	}
	return devs
}
|
||||
|
||||
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
for _, dev := range devs {
|
||||
if isAMDGPUDevice(dev) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
// queryAMDGPUs gathers per-GPU details from rocm-smi and returns them keyed
// by normalised PCIe BDF. The --showbus query is mandatory (its failure
// aborts enrichment); the remaining field queries are best effort.
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
	busByCard, err := queryAMDField("--showbus")
	if err != nil {
		return nil, err
	}
	infoByCard := map[string]amdGPUInfo{}
	for card, bus := range busByCard {
		bdf := normalizePCIeBDF(bus)
		if bdf == "" {
			continue
		}
		infoByCard[card] = amdGPUInfo{BDF: bdf}
	}
	if len(infoByCard) == 0 {
		return map[string]amdGPUInfo{}, nil
	}
	// Each merge issues one rocm-smi invocation; a failed or empty query
	// simply leaves the corresponding field unset.
	mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
	mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
	mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
	mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
	mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })

	// Re-key by BDF for direct lookup against PCIe device addresses.
	result := make(map[string]amdGPUInfo, len(infoByCard))
	for _, info := range infoByCard {
		if info.BDF == "" {
			continue
		}
		result[info.BDF] = info
	}
	return result, nil
}
|
||||
|
||||
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||
values, err := queryAMDNumericField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
// queryAMDField invokes rocm-smi with the given flag in CSV mode and returns
// the parsed card→value map. The combined stdout+stderr stream is handed to
// the parser, which tolerates non-CSV noise lines.
func queryAMDField(flag string) (map[string]string, error) {
	cmd, err := resolveAMDSMICmd(flag, "--csv")
	if err != nil {
		return nil, err
	}
	out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
	if err != nil {
		return nil, err
	}
	return parseROCmSingleValueCSV(string(out)), nil
}
|
||||
|
||||
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]float64{}
|
||||
for card, raw := range values {
|
||||
if value, ok := firstFloat(raw); ok {
|
||||
out[card] = value
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, pattern := range amdSMIExecutableGlobs {
|
||||
matches, err := amdSMIGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
return append([]string{match}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, exec.ErrNotFound
|
||||
}
|
||||
|
||||
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||
rows := map[string]string{}
|
||||
reader := csv.NewReader(strings.NewReader(raw))
|
||||
reader.FieldsPerRecord = -1
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return rows
|
||||
}
|
||||
for _, rec := range records {
|
||||
if len(rec) < 2 {
|
||||
continue
|
||||
}
|
||||
card := normalizeROCmCardKey(rec[0])
|
||||
if card == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||
continue
|
||||
}
|
||||
rows[card] = value
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
// normalizeROCmCardKey canonicalises the first CSV column to a "cardN" key.
// Bare indices become "cardN"; header tokens ("device", "gpu", "card") and
// anything else unrecognised map to "".
func normalizeROCmCardKey(raw string) string {
	key := strings.Trim(strings.ToLower(strings.TrimSpace(raw)), "\"")
	switch key {
	case "", "device", "gpu", "card":
		return ""
	}
	if strings.HasPrefix(key, "card") {
		return key
	}
	if _, err := strconv.Atoi(key); err == nil {
		return "card" + key
	}
	return ""
}
|
||||
|
||||
// looksLikeCSVHeaderValue reports whether value resembles a rocm-smi CSV
// header cell (e.g. "Serial Number", "PCI Bus") rather than real data.
func looksLikeCSVHeaderValue(value string) bool {
	lowered := strings.ToLower(strings.TrimSpace(value))
	for _, marker := range []string{"product", "serial", "vbios", "bus"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestParseROCmSingleValueCSV checks that basic "cardN,value" rows parse into
// the expected map while the header row ("device,Serial Number") is skipped.
func TestParseROCmSingleValueCSV(t *testing.T) {
	raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
	got := parseROCmSingleValueCSV(raw)
	if got["card0"] != "ABC123" {
		t.Fatalf("card0=%q want ABC123", got["card0"])
	}
	if got["card1"] != "XYZ789" {
		t.Fatalf("card1=%q want XYZ789", got["card1"])
	}
}
||||
|
||||
// TestQueryAMDNumericFieldParsesUnits stubs rocm-smi lookup and invocation to
// verify that unit-suffixed readings such as "45.5c" are parsed into plain
// floats.
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
	origExec := amdSMIExecCommand
	origLookPath := amdSMILookPath
	t.Cleanup(func() {
		// Restore the real exec hooks so other tests are unaffected.
		amdSMIExecCommand = origExec
		amdSMILookPath = origLookPath
	})

	amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
	amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
		// Fake rocm-smi output: header plus two temperature rows.
		return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
	}

	got, err := queryAMDNumericField("--showtemp")
	if err != nil {
		t.Fatalf("queryAMDNumericField: %v", err)
	}
	if got["card0"] != 45.5 {
		t.Fatalf("card0=%v want 45.5", got["card0"])
	}
	if got["card1"] != 67.0 {
		t.Fatalf("card1=%v want 67.0", got["card1"])
	}
}
|
||||
|
||||
// TestNormalizeROCmCardKey table-tests card-key canonicalisation, covering a
// bare index, a passthrough "cardN" key, a header token, and empty input.
func TestNormalizeROCmCardKey(t *testing.T) {
	tests := map[string]string{
		"0": "card0",
		"card1": "card1",
		"Device": "",
		"": "",
	}
	for input, want := range tests {
		if got := normalizeROCmCardKey(input); got != want {
			t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
		}
	}
}
|
||||
@@ -36,6 +36,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
package collector
|
||||
|
||||
import "bee/audit/internal/schema"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
@@ -33,15 +41,40 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
hasIdentity := false
|
||||
switch {
|
||||
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||
hasIdentity = true
|
||||
case psu.Slot != nil && *psu.Slot != "":
|
||||
hasIdentity = true
|
||||
case psu.Model != nil && *psu.Model != "":
|
||||
hasIdentity = true
|
||||
case psu.Vendor != nil && *psu.Vendor != "":
|
||||
hasIdentity = true
|
||||
}
|
||||
if !hasIdentity {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
@@ -61,3 +74,20 @@ func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
|
||||
t.Fatalf("duplicate serial should stay unchanged: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
|
||||
slot := "0"
|
||||
status := statusOK
|
||||
|
||||
got := filterPSUs([]schema.HardwarePowerSupply{
|
||||
{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("unexpected kept PSU: %+v", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,18 +2,21 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
mstflintQuery = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -21,7 +24,7 @@ var (
|
||||
}
|
||||
|
||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -29,6 +32,14 @@ var (
|
||||
}
|
||||
|
||||
netIfacesByBDF = listNetIfacesByBDF
|
||||
readNetCarrierFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
|
||||
}
|
||||
return ifaces
|
||||
}
|
||||
|
||||
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
func interfaceHasCarrier(iface string) bool {
|
||||
raw, err := readNetCarrierFile(iface)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(raw) == "1"
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
var (
|
||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
if devs[i].SerialNumber == nil {
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
@@ -53,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
}
|
||||
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
if interfaceHasCarrier(iface) {
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSFPDOM(t *testing.T) {
|
||||
raw := `
|
||||
@@ -29,6 +33,113 @@ func TestParseSFPDOM(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||
raw := `
|
||||
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||
Serial number: NIC-SN-12345
|
||||
`
|
||||
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePCIVPDSerial(t *testing.T) {
|
||||
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:18:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: NIC-SN-98765\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
if iface != "eth0" {
|
||||
t.Fatalf("unexpected iface: %s", iface)
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||
}
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) {
|
||||
t.Fatal("ethtool -m should not be called without carrier")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -13,14 +13,18 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt(v string) *int {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
|
||||
@@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
val := strings.TrimSpace(line[idx+2:])
|
||||
fields[key] = val
|
||||
}
|
||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
||||
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||
continue
|
||||
}
|
||||
dev := parseLspciDevice(fields)
|
||||
@@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldIncludePCIeDevice(class string) bool {
|
||||
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
c := strings.ToLower(strings.TrimSpace(class))
|
||||
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||
d := strings.ToLower(strings.TrimSpace(device))
|
||||
if c == "" {
|
||||
return true
|
||||
}
|
||||
@@ -57,6 +59,7 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
@@ -68,12 +71,28 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"audio device",
|
||||
"serial bus controller",
|
||||
"unassigned class",
|
||||
"non-essential instrumentation",
|
||||
}
|
||||
for _, bad := range excluded {
|
||||
if strings.Contains(c, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
"reserved spp",
|
||||
"ptdma",
|
||||
"cryptographic coprocessor pspcpp",
|
||||
"pspcpp",
|
||||
}
|
||||
for _, bad := range internalAMDPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -98,6 +117,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
@@ -165,6 +186,18 @@ func readPCINumaNode(bdf string) (int, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
func parsePCINumaNode(raw string) (int, bool) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return 0, false
|
||||
}
|
||||
value, err := strconv.Atoi(raw)
|
||||
if err != nil || value < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return value, true
|
||||
}
|
||||
|
||||
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -8,32 +8,43 @@ import (
|
||||
|
||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
class string
|
||||
want bool
|
||||
name string
|
||||
class string
|
||||
vendor string
|
||||
device string
|
||||
want bool
|
||||
}{
|
||||
{"USB controller", false},
|
||||
{"System peripheral", false},
|
||||
{"Audio device", false},
|
||||
{"Host bridge", false},
|
||||
{"PCI bridge", false},
|
||||
{"SMBus", false},
|
||||
{"Performance counters", false},
|
||||
{"Ethernet controller", true},
|
||||
{"RAID bus controller", true},
|
||||
{"Non-Volatile memory controller", true},
|
||||
{"VGA compatible controller", true},
|
||||
{name: "usb", class: "USB controller", want: false},
|
||||
{name: "system peripheral", class: "System peripheral", want: false},
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := shouldIncludePCIeDevice(tt.class)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
@@ -51,6 +62,35 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
@@ -68,6 +108,18 @@ func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
tests := []struct {
|
||||
raw string
|
||||
|
||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
readPCIVPDFile = func(bdf string) ([]byte, error) {
|
||||
return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
|
||||
}
|
||||
)
|
||||
|
||||
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !shouldProbePCIeSerial(devs[i]) {
|
||||
continue
|
||||
}
|
||||
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("pcie: serials enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(class) || isGPUClass(class)
|
||||
}
|
||||
|
||||
func queryPCIDeviceSerial(bdf string) string {
|
||||
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||
return parsePCIVPDSerial(raw)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseLSPCIDetailSerial(raw string) string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
lower := strings.ToLower(line)
|
||||
if !strings.Contains(lower, "serial number:") {
|
||||
continue
|
||||
}
|
||||
idx := strings.Index(line, ":")
|
||||
if idx < 0 {
|
||||
continue
|
||||
}
|
||||
if serial := strings.TrimSpace(line[idx+1:]); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parsePCIVPDSerial(raw []byte) string {
|
||||
for i := 0; i+3 < len(raw); i++ {
|
||||
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||
continue
|
||||
}
|
||||
length := int(raw[i+2])
|
||||
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||
if !looksLikeSerial(value) {
|
||||
continue
|
||||
}
|
||||
return value
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeSerial(value string) bool {
|
||||
if len(value) < 4 {
|
||||
return false
|
||||
}
|
||||
hasAlphaNum := false
|
||||
for _, r := range value {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
|
||||
hasAlphaNum = true
|
||||
case strings.ContainsRune(" -_./:", r):
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
return hasAlphaNum
|
||||
}
|
||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:11:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: GPU-SN-12345\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
bdf := "0000:11:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithPCISerials(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||
class := "StorageController"
|
||||
bdf := "0000:19:00.0"
|
||||
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||
if shouldProbePCIeSerial(dev) {
|
||||
t.Fatal("unexpected probe for storage controller")
|
||||
}
|
||||
}
|
||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
func isVirtualHDiskModel(model string) bool {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
return strings.HasPrefix(model, "virtual hdisk")
|
||||
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
@@ -190,6 +207,7 @@ type smartctlInfo struct {
|
||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -348,6 +366,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
@@ -9,8 +9,50 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
|
||||
func removableTargetReadOnly(fields map[string]string) bool {
|
||||
if fields["RO"] == "1" {
|
||||
return true
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
|
||||
case "iso9660", "squashfs":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ensureWritableMountpoint(mountpoint string) error {
|
||||
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||
}
|
||||
name := probe.Name()
|
||||
if closeErr := probe.Close(); closeErr != nil {
|
||||
_ = os.Remove(name)
|
||||
return closeErr
|
||||
}
|
||||
if err := os.Remove(name); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -34,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !removable || fields["FSTYPE"] == "" {
|
||||
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -52,7 +94,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||
if src == "" || target.Device == "" {
|
||||
return "", fmt.Errorf("source and target are required")
|
||||
}
|
||||
@@ -62,20 +104,43 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
|
||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||
mountedHere := false
|
||||
mounted := mountpoint != ""
|
||||
if mountpoint == "" {
|
||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
return "", formatMountTargetError(target, string(raw), err)
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
}
|
||||
defer func() {
|
||||
if !mounted {
|
||||
return
|
||||
}
|
||||
_ = exportExecCommand("sync").Run()
|
||||
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
retErr = err
|
||||
} else {
|
||||
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
}
|
||||
if mountedHere {
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst := filepath.Join(mountpoint, filename)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -83,12 +148,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = exec.Command("sync").Run()
|
||||
|
||||
if mountedHere {
|
||||
_ = exec.Command("umount", mountpoint).Run()
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
112
audit/internal/platform/export_test.go
Normal file
112
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
|
||||
var calls [][]string
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
calls = append(calls, append([]string{name}, args...))
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||
}
|
||||
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||
t.Fatalf("dst=%q want %q", got, want)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||
t.Fatalf("exported file missing: %v", err)
|
||||
}
|
||||
|
||||
foundUmount := false
|
||||
for _, call := range calls {
|
||||
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||
foundUmount = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundUmount {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||
t.Fatalf("chmod mountpoint: %v", err)
|
||||
}
|
||||
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-writable mountpoint")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||
t.Fatalf("err=%q want writable message", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||
oldExec := exportExecCommand
|
||||
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||
return cmd
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
targets, err := s.ListRemovableTargets()
|
||||
if err != nil {
|
||||
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||
}
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||
}
|
||||
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||
}
|
||||
}
|
||||
@@ -13,18 +13,19 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64
|
||||
GPUIndex int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
@@ -45,16 +46,17 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 5 {
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
rows = append(rows, GPUMetricRow{
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
@@ -69,6 +71,71 @@ func parseGPUFloat(s string) float64 {
|
||||
return v
|
||||
}
|
||||
|
||||
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||
if len(lines) < 2 {
|
||||
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||
}
|
||||
|
||||
// Parse header to find column indices by name.
|
||||
headers := strings.Split(lines[0], ",")
|
||||
colIdx := func(keywords ...string) int {
|
||||
for i, h := range headers {
|
||||
hl := strings.ToLower(strings.TrimSpace(h))
|
||||
for _, kw := range keywords {
|
||||
if strings.Contains(hl, kw) {
|
||||
return i
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||
idxUse := colIdx("gpu use (%)")
|
||||
idxMem := colIdx("vram%", "memory allocated")
|
||||
idxPow := colIdx("average graphics package power", "power (w)")
|
||||
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range lines[1:] {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
idx := len(rows)
|
||||
row := GPUMetricRow{GPUIndex: idx}
|
||||
get := func(i int) float64 {
|
||||
if i < 0 || i >= len(parts) {
|
||||
return 0
|
||||
}
|
||||
v := strings.TrimSpace(parts[i])
|
||||
if strings.EqualFold(v, "n/a") {
|
||||
return 0
|
||||
}
|
||||
return parseGPUFloat(v)
|
||||
}
|
||||
row.TempC = get(idxTemp)
|
||||
row.UsagePct = get(idxUse)
|
||||
row.MemUsagePct = get(idxMem)
|
||||
row.PowerW = get(idxPow)
|
||||
rows = append(rows, row)
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
@@ -327,7 +394,7 @@ const (
|
||||
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Suitable for display in the TUI screenOutput.
|
||||
// Used in SAT stress-test logs.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
|
||||
269
audit/internal/platform/install.go
Normal file
269
audit/internal/platform/install.go
Normal file
@@ -0,0 +1,269 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
func (s *System) ListInstallDisks() ([]InstallDisk, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,MODEL,SIZE,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lsblk: %w", err)
|
||||
}
|
||||
|
||||
bootDev := findLiveBootDevice()
|
||||
|
||||
var disks []InstallDisk
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
// NAME MODEL SIZE TYPE TRAN — model may have spaces so we parse from end
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
// Last field: TRAN, second-to-last: TYPE, third-to-last: SIZE
|
||||
typ := fields[len(fields)-2]
|
||||
size := fields[len(fields)-3]
|
||||
name := fields[0]
|
||||
model := strings.Join(fields[1:len(fields)-3], " ")
|
||||
|
||||
if typ != "disk" {
|
||||
continue
|
||||
}
|
||||
|
||||
device := "/dev/" + name
|
||||
if device == bootDev {
|
||||
continue
|
||||
}
|
||||
|
||||
sizeBytes := diskSizeBytes(device)
|
||||
mounted := mountedParts(device)
|
||||
|
||||
disks = append(disks, InstallDisk{
|
||||
Device: device,
|
||||
Model: strings.TrimSpace(model),
|
||||
Size: size,
|
||||
SizeBytes: sizeBytes,
|
||||
MountedParts: mounted,
|
||||
})
|
||||
}
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
// diskSizeBytes returns the byte size of a block device using lsblk.
|
||||
func diskSizeBytes(device string) int64 {
|
||||
out, err := exec.Command("lsblk", "-bdn", "-o", "SIZE", device).Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
n, _ := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64)
|
||||
return n
|
||||
}
|
||||
|
||||
// mountedParts returns a list of "<part> at <mountpoint>" strings for any
|
||||
// mounted partitions on the given device.
|
||||
func mountedParts(device string) []string {
|
||||
out, err := exec.Command("lsblk", "-n", "-o", "NAME,MOUNTPOINT", device).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var result []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
mp := fields[1]
|
||||
if mp == "" || mp == "[SWAP]" {
|
||||
continue
|
||||
}
|
||||
result = append(result, "/dev/"+strings.TrimLeft(fields[0], "└─├─")+" at "+mp)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// findLiveBootDevice returns the block device backing /run/live/medium (if any).
|
||||
func findLiveBootDevice() string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
src := strings.TrimSpace(string(out))
|
||||
if src == "" {
|
||||
return ""
|
||||
}
|
||||
// Strip partition suffix to get the whole disk device.
|
||||
// e.g. /dev/sdb1 → /dev/sdb, /dev/nvme0n1p1 → /dev/nvme0n1
|
||||
out2, err := exec.Command("lsblk", "-no", "PKNAME", src).Output()
|
||||
if err != nil || strings.TrimSpace(string(out2)) == "" {
|
||||
return src
|
||||
}
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
func mountSource(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func mountFSType(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceType(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceTransport(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
switch {
|
||||
case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
|
||||
return "ram"
|
||||
case strings.EqualFold(strings.TrimSpace(deviceType), "rom"):
|
||||
return "cdrom"
|
||||
case strings.EqualFold(strings.TrimSpace(transport), "usb"):
|
||||
return "usb"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"):
|
||||
return "cdrom"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
|
||||
return "disk"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
func toramActive() bool {
|
||||
data, err := os.ReadFile("/proc/cmdline")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.Contains(string(data), "toram")
|
||||
}
|
||||
|
||||
// freeMemBytes returns MemAvailable from /proc/meminfo.
|
||||
func freeMemBytes() int64 {
|
||||
data, err := os.ReadFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
if strings.HasPrefix(line, "MemAvailable:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
n, _ := strconv.ParseInt(fields[1], 10, 64)
|
||||
return n * 1024 // kB → bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// DiskWarnings returns advisory warning strings for a disk candidate.
|
||||
func DiskWarnings(d InstallDisk) []string {
|
||||
var w []string
|
||||
if len(d.MountedParts) > 0 {
|
||||
w = append(w, "has mounted partitions: "+strings.Join(d.MountedParts, ", "))
|
||||
}
|
||||
min := MinInstallBytes()
|
||||
if min > 0 && d.SizeBytes > 0 && d.SizeBytes < min {
|
||||
w = append(w, fmt.Sprintf("disk may be too small (need ≥ %s, have %s)",
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
func humanBytes(b int64) string {
|
||||
const unit = 1024
|
||||
if b < unit {
|
||||
return fmt.Sprintf("%d B", b)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := b / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
// InstallToDisk runs bee-install <device> <logfile> and streams output to logFile.
|
||||
// The context can be used to cancel.
|
||||
func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
cmd := exec.CommandContext(ctx, "bee-install", device, logFile)
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// InstallLogPath returns the default install log path for a given device.
|
||||
func InstallLogPath(device string) string {
|
||||
safe := strings.NewReplacer("/", "_", " ", "_").Replace(device)
|
||||
return "/tmp/bee-install" + safe + ".log"
|
||||
}
|
||||
|
||||
// Label returns a display label for a disk.
|
||||
func (d InstallDisk) Label() string {
|
||||
model := d.Model
|
||||
if model == "" {
|
||||
model = "Unknown"
|
||||
}
|
||||
return fmt.Sprintf("%s %s %s", d.Device, d.Size, model)
|
||||
}
|
||||
220
audit/internal/platform/install_to_ram.go
Normal file
220
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,220 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.EqualFold(fsType, "tmpfs")
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(sf)
|
||||
dst := filepath.Join(dstDir, base)
|
||||
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||
return fmt.Errorf("copy %s: %v", base, err)
|
||||
}
|
||||
log(fmt.Sprintf("Copied %s.", base))
|
||||
|
||||
loopDev, err := findLoopForFile(sf)
|
||||
if err != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||
continue
|
||||
}
|
||||
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
}
|
||||
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
fi, err := in.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
n, err := in.Read(buf)
|
||||
if n > 0 {
|
||||
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(src, path)
|
||||
target := filepath.Join(dst, rel)
|
||||
if fi.IsDir() {
|
||||
return os.MkdirAll(target, fi.Mode())
|
||||
}
|
||||
if strings.HasSuffix(path, ".squashfs") {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(target); err == nil {
|
||||
return nil
|
||||
}
|
||||
return copyFileLarge(ctx, path, target, nil)
|
||||
})
|
||||
}
|
||||
|
||||
func findLoopForFile(backingFile string) (string, error) {
|
||||
out, err := exec.Command("losetup", "--list", "--json").Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Name string `json:"name"`
|
||||
BackFile string `json:"back-file"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, dev := range result.Loopdevices {
|
||||
if dev.BackFile == backingFile {
|
||||
return dev.Name, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
return loopChangeFD(loopDev, newFile)
|
||||
}
|
||||
28
audit/internal/platform/install_to_ram_linux.go
Normal file
28
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,28 @@
|
||||
//go:build linux
|
||||
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
const ioctlLoopChangeFD = 0x4C08
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer lf.Close()
|
||||
nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nf.Close()
|
||||
_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
|
||||
if errno != 0 {
|
||||
return errno
|
||||
}
|
||||
return nil
|
||||
}
|
||||
9
audit/internal/platform/install_to_ram_other.go
Normal file
9
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,9 @@
|
||||
//go:build !linux
|
||||
|
||||
package platform
|
||||
|
||||
import "errors"
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
28
audit/internal/platform/install_to_ram_test.go
Normal file
28
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestInferLiveBootKind exercises the boot-medium classifier with one case
// per recognised kind (ram/usb/cdrom/disk) plus the unknown fallback for an
// overlay source.
func TestInferLiveBootKind(t *testing.T) {
	tests := []struct {
		name       string
		fsType     string
		source     string
		deviceType string
		transport  string
		want       string
	}{
		{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
		{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
		{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
		{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
		{name: "unknown", source: "overlay", want: "unknown"},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
			if got != tc.want {
				t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
			}
		})
	}
}
|
||||
64
audit/internal/platform/kill_workers.go
Normal file
64
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
// bee test worker processes that should be killed by KillTestWorkers.
|
||||
var workerPatterns = []string{
|
||||
"bee-gpu-burn",
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
type KilledProcess struct {
|
||||
PID int `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(e.Name())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||
exe := strings.TrimSpace(args[0])
|
||||
base := exe
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
328
audit/internal/platform/live_metrics.go
Normal file
328
audit/internal/platform/live_metrics.go
Normal file
@@ -0,0 +1,328 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
type TempReading struct {
|
||||
Name string `json:"name"`
|
||||
Group string `json:"group,omitempty"`
|
||||
Celsius float64 `json:"celsius"`
|
||||
}
|
||||
|
||||
// SampleLiveMetrics collects a single metrics snapshot from all available
|
||||
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
|
||||
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
|
||||
// SampleLiveMetrics collects a single metrics snapshot from all available
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
// and system power (via ipmitool dcmi). Missing sources are silently skipped,
// so any field of the returned sample may be zero/empty.
func SampleLiveMetrics() LiveMetricSample {
	s := LiveMetricSample{Timestamp: time.Now().UTC()}

	// GPU metrics — try NVIDIA first, fall back to AMD
	if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
		s.GPUs = gpus
	} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
		s.GPUs = amdGPUs
	}

	// Fan speeds — skipped silently if ipmitool unavailable
	fans, _ := sampleFanSpeeds()
	s.Fans = fans

	// Non-GPU temperature sensors; synthesise a "CPU Max" reading only when
	// no cpu-group sensor was reported, so the UI always has a CPU temp.
	s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
	if !hasTempGroup(s.Temps, "cpu") {
		if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
			s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
		}
	}

	// System power — returns 0 if unavailable
	s.PowerW = sampleSystemPower()

	// CPU load — from /proc/stat (blocks ~200ms for the two-sample delta)
	s.CPULoadPct = sampleCPULoadPct()

	// Memory load — from /proc/meminfo
	s.MemLoadPct = sampleMemLoadPct()

	return s
}
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
func sampleCPULoadPct() float64 {
|
||||
total0, idle0 := readCPUStat()
|
||||
if total0 == 0 {
|
||||
return 0
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
total1, idle1 := readCPUStat()
|
||||
if total1 == 0 {
|
||||
return 0
|
||||
}
|
||||
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||
}
|
||||
|
||||
// cpuLoadPctBetween computes CPU utilisation (percent, clamped to [0,100])
// between two cumulative /proc/stat samples of total and idle jiffies.
//
// FIX: the counters are uint64, so a wrapped or reset total counter
// (total < prevTotal) previously produced a huge unsigned delta and a bogus
// ~100% reading. Non-advancing or backwards totals now return 0; a
// backwards idle counter is treated as fully idle, which the clamp turns
// into 0% — matching the old clamped behavior for that case.
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
	if total <= prevTotal {
		return 0
	}
	dt := float64(total - prevTotal)
	var di float64
	if idle >= prevIdle {
		di = float64(idle - prevIdle)
	} else {
		di = dt // idle counter reset/wrap: conservative 0% load
	}
	pct := (1 - di/dt) * 100
	if pct < 0 {
		return 0
	}
	if pct > 100 {
		return 100
	}
	return pct
}
|
||||
|
||||
// readCPUStat returns the cumulative total and idle jiffy counters from the
// aggregate "cpu " line of /proc/stat. Both are 0 when the file cannot be
// opened or the aggregate line is absent.
func readCPUStat() (total, idle uint64) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0
	}
	defer f.Close()
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		text := scanner.Text()
		// Only the aggregate line ("cpu ", not "cpu0" etc.) is wanted.
		if !strings.HasPrefix(text, "cpu ") {
			continue
		}
		cols := strings.Fields(text)[1:] // drop the "cpu" label
		var counters [10]uint64
		for i := range counters {
			if i >= len(cols) {
				break
			}
			counters[i], _ = strconv.ParseUint(cols[i], 10, 64)
		}
		for _, c := range counters {
			total += c
		}
		// Idle time is the idle column plus iowait.
		idle = counters[3] + counters[4]
		return total, idle
	}
	return 0, 0
}
|
||||
|
||||
// sampleMemLoadPct returns memory utilisation as a percentage, computed as
// (MemTotal - MemAvailable) / MemTotal from /proc/meminfo. Returns 0 on any
// read/parse failure.
func sampleMemLoadPct() float64 {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0
	}
	defer f.Close()
	kv := make(map[string]uint64)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		// Lines look like "MemTotal:  16384 kB"; units cancel in the ratio.
		parts := strings.Fields(scanner.Text())
		if len(parts) < 2 {
			continue
		}
		n, _ := strconv.ParseUint(parts[1], 10, 64)
		kv[strings.TrimSuffix(parts[0], ":")] = n
	}
	total := kv["MemTotal"]
	avail := kv["MemAvailable"]
	if total == 0 {
		return 0
	}
	return float64(total-avail) / float64(total) * 100
}
|
||||
|
||||
func hasTempGroup(temps []TempReading, group string) bool {
|
||||
for _, t := range temps {
|
||||
if t.Group == group {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func sampleLiveTemperatureReadings() []TempReading {
|
||||
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||
return temps
|
||||
}
|
||||
return sampleLiveTempsViaIPMI()
|
||||
}
|
||||
|
||||
// sampleLiveTempsViaSensorsJSON parses `sensors -j` output into temperature
// readings. Chips and features are iterated in sorted order so the result
// is deterministic across calls; GPU sensors are dropped here (GPU temps
// are reported per-GPU elsewhere), implausible values (<=0 or >150 C) are
// filtered, and each group+label pair is emitted at most once. Returns nil
// on any failure so the caller can fall back to IPMI.
func sampleLiveTempsViaSensorsJSON() []TempReading {
	out, err := exec.Command("sensors", "-j").Output()
	if err != nil || len(out) == 0 {
		return nil
	}

	// Top level of the JSON: chip name -> feature name -> feature object.
	var doc map[string]map[string]any
	if err := json.Unmarshal(out, &doc); err != nil {
		return nil
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	temps := make([]TempReading, 0, len(chips))
	seen := map[string]struct{}{}
	for _, chip := range chips {
		features := doc[chip]
		featureNames := make([]string, 0, len(features))
		for name := range features {
			featureNames = append(featureNames, name)
		}
		sort.Strings(featureNames)
		for _, name := range featureNames {
			// "Adapter" is bus metadata, not a sensor feature.
			if strings.EqualFold(name, "Adapter") {
				continue
			}
			feature, ok := features[name].(map[string]any)
			if !ok {
				continue
			}
			value, ok := firstTempInputValue(feature)
			if !ok || value <= 0 || value > 150 {
				continue
			}
			group := classifyLiveTempGroup(chip, name)
			if group == "gpu" {
				continue
			}
			label := strings.TrimSpace(name)
			if label == "" {
				continue
			}
			if group == "ambient" {
				label = compactAmbientTempName(chip, label)
			}
			// NUL cannot occur in either part, so the key is unambiguous.
			key := group + "\x00" + label
			if _, ok := seen[key]; ok {
				continue
			}
			seen[key] = struct{}{}
			temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
		}
	}
	return temps
}
|
||||
|
||||
// sampleLiveTempsViaIPMI parses `ipmitool sdr type Temperature` output into
// temperature readings, applying the same GPU/implausible-value/duplicate
// filtering as the sensors-JSON path. Returns nil on any failure.
//
// NOTE(review): this assumes pipe-separated columns of name|value|unit;
// stock `ipmitool sdr type` prints name|id|status|entity|reading — confirm
// against the ipmitool version shipped with the live image.
func sampleLiveTempsViaIPMI() []TempReading {
	out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
	if err != nil || len(out) == 0 {
		return nil
	}
	var temps []TempReading
	seen := map[string]struct{}{}
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		parts := strings.Split(line, "|")
		if len(parts) < 3 {
			continue
		}
		name := strings.TrimSpace(parts[0])
		if name == "" {
			continue
		}
		// Only rows whose third column mentions "degrees" are temperatures.
		unit := strings.ToLower(strings.TrimSpace(parts[2]))
		if !strings.Contains(unit, "degrees") {
			continue
		}
		raw := strings.TrimSpace(parts[1])
		if raw == "" || strings.EqualFold(raw, "na") {
			continue
		}
		value, err := strconv.ParseFloat(raw, 64)
		if err != nil || value <= 0 || value > 150 {
			continue
		}
		group := classifyLiveTempGroup("", name)
		if group == "gpu" {
			continue
		}
		label := name
		if group == "ambient" {
			label = compactAmbientTempName("", label)
		}
		// De-duplicate on group+label, same scheme as the sensors path.
		key := group + "\x00" + label
		if _, ok := seen[key]; ok {
			continue
		}
		seen[key] = struct{}{}
		temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
	}
	return temps
}
|
||||
|
||||
// firstTempInputValue scans an lm-sensors feature object for the first
// (alphabetically) key shaped like "temp*_input" and returns its numeric
// value. Values may arrive as JSON numbers or numeric strings; ok is false
// when no such key holds a usable number.
func firstTempInputValue(feature map[string]any) (float64, bool) {
	names := make([]string, 0, len(feature))
	for k := range feature {
		names = append(names, k)
	}
	sort.Strings(names)
	for _, k := range names {
		lk := strings.ToLower(k)
		if !strings.HasSuffix(lk, "_input") || !strings.Contains(lk, "temp") {
			continue
		}
		switch v := feature[k].(type) {
		case float64:
			return v, true
		case string:
			if parsed, err := strconv.ParseFloat(v, 64); err == nil {
				return parsed, true
			}
		}
	}
	return 0, false
}
|
||||
|
||||
// classifyLiveTempGroup buckets a sensor into "gpu", "cpu" or "ambient"
// by substring-matching the lowercased chip and feature names. GPU markers
// win over CPU markers; "adeon" deliberately matches both "Radeon" and
// "radeon". Everything unrecognised is "ambient".
func classifyLiveTempGroup(chip, name string) string {
	text := strings.ToLower(strings.TrimSpace(chip + " " + name))
	for _, marker := range []string{"gpu", "amdgpu", "nvidia", "adeon"} {
		if strings.Contains(text, marker) {
			return "gpu"
		}
	}
	cpuMarkers := []string{
		"coretemp", "k10temp", "zenpower", "package id", "x86_pkg_temp",
		"tctl", "tdie", "tccd", "cpu", "peci",
	}
	for _, marker := range cpuMarkers {
		if strings.Contains(text, marker) {
			return "cpu"
		}
	}
	return "ambient"
}
|
||||
|
||||
// compactAmbientTempName prefixes an ambient sensor label with its chip
// name ("chip / name") unless the chip is empty, equal to the label
// (case-insensitively), or already contained within it.
func compactAmbientTempName(chip, name string) string {
	chip, name = strings.TrimSpace(chip), strings.TrimSpace(name)
	switch {
	case chip == "":
		return name
	case strings.EqualFold(chip, name):
		return name
	case strings.Contains(strings.ToLower(name), strings.ToLower(chip)):
		return name
	default:
		return chip + " / " + name
	}
}
|
||||
94
audit/internal/platform/live_metrics_test.go
Normal file
94
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestFirstTempInputValue checks that the temp*_input key is selected and
// that the sibling temp1_max limit entry is ignored.
func TestFirstTempInputValue(t *testing.T) {
	feature := map[string]any{
		"temp1_input": 61.5,
		"temp1_max":   80.0,
	}
	got, ok := firstTempInputValue(feature)
	if !ok {
		t.Fatal("expected value")
	}
	if got != 61.5 {
		t.Fatalf("got %v want 61.5", got)
	}
}
|
||||
|
||||
// TestClassifyLiveTempGroup covers one representative chip/name pair per
// bucket, including the ambient fallback for NVMe and ACPI sensors.
func TestClassifyLiveTempGroup(t *testing.T) {
	tests := []struct {
		chip string
		name string
		want string
	}{
		{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
		{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
		{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
		{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
	}
	for _, tc := range tests {
		if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
			t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
		}
	}
}
|
||||
|
||||
// TestCompactAmbientTempName checks chip-prefixing and the empty-chip
// passthrough case.
func TestCompactAmbientTempName(t *testing.T) {
	if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
		t.Fatalf("got %q", got)
	}
	if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
		t.Fatalf("got %q", got)
	}
}
|
||||
|
||||
// TestCPULoadPctBetween exercises the utilisation formula over a jiffy
// delta: half-busy, fully busy, no counter progress, and the clamp when the
// idle delta exceeds the total delta.
func TestCPULoadPctBetween(t *testing.T) {
	tests := []struct {
		name      string
		prevTotal uint64
		prevIdle  uint64
		total     uint64
		idle      uint64
		want      float64
	}{
		{
			name:      "busy half",
			prevTotal: 100,
			prevIdle:  40,
			total:     200,
			idle:      90,
			want:      50,
		},
		{
			name:      "fully busy",
			prevTotal: 100,
			prevIdle:  40,
			total:     200,
			idle:      40,
			want:      100,
		},
		{
			name:      "no progress",
			prevTotal: 100,
			prevIdle:  40,
			total:     100,
			idle:      40,
			want:      0,
		},
		{
			name:      "idle delta larger than total clamps to zero",
			prevTotal: 100,
			prevIdle:  40,
			total:     200,
			idle:      150,
			want:      0,
		},
	}

	for _, tc := range tests {
		if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
			t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
		}
	}
}
|
||||
@@ -2,6 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
||||
out := make([]InterfaceInfo, 0, len(names))
|
||||
for _, name := range names {
|
||||
state := "unknown"
|
||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
||||
fields := strings.Fields(string(raw))
|
||||
if len(fields) >= 9 {
|
||||
state = fields[8]
|
||||
if up, err := interfaceAdminState(name); err == nil {
|
||||
if up {
|
||||
state = "up"
|
||||
} else {
|
||||
state = "down"
|
||||
}
|
||||
}
|
||||
|
||||
var ipv4 []string
|
||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 4 {
|
||||
ipv4 = append(ipv4, fields[3])
|
||||
}
|
||||
}
|
||||
ipv4, err := interfaceIPv4Addrs(name)
|
||||
if err != nil {
|
||||
ipv4 = nil
|
||||
}
|
||||
|
||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// CaptureNetworkSnapshot records the current network state — per-interface
// admin up/down and IPv4 addresses, the default route lines from
// `ip route show default`, and the contents of /etc/resolv.conf — so that
// RestoreNetworkSnapshot can put networking back after a disruptive test.
// Route and resolv.conf capture are best-effort; failures enumerating or
// querying interfaces abort with an error.
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
	names, err := listInterfaceNames()
	if err != nil {
		return NetworkSnapshot{}, err
	}

	snapshot := NetworkSnapshot{
		Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
	}
	for _, name := range names {
		up, err := interfaceAdminState(name)
		if err != nil {
			return NetworkSnapshot{}, err
		}
		ipv4, err := interfaceIPv4Addrs(name)
		if err != nil {
			return NetworkSnapshot{}, err
		}
		snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
			Name: name,
			Up:   up,
			IPv4: ipv4,
		})
	}

	// Best-effort: no default route (or `ip` failing) leaves the list empty.
	if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
		for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
			line = strings.TrimSpace(line)
			if line != "" {
				snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
			}
		}
	}

	// Best-effort: a missing resolv.conf simply restores as empty content.
	if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
		snapshot.ResolvConf = string(raw)
	}

	return snapshot, nil
}
|
||||
|
||||
// RestoreNetworkSnapshot reapplies a previously captured network state. For
// each interface it brings the link up, flushes and re-adds the recorded
// IPv4 addresses, then restores the recorded admin state; afterwards it
// replaces the default route(s) and /etc/resolv.conf. Every step is
// attempted even after earlier failures; all failures are accumulated and
// joined into a single error.
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
	var errs []string

	for _, iface := range snapshot.Interfaces {
		// The link must be up before addresses can be re-added reliably.
		if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
			errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
			continue
		}
		// Drop whatever addresses the test left behind before re-adding.
		if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
			errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
		}
		for _, cidr := range iface.IPv4 {
			if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
				detail := strings.TrimSpace(string(raw))
				if detail != "" {
					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
				} else {
					errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
				}
			}
		}
		// Finally restore the admin state recorded at capture time.
		state := "down"
		if iface.Up {
			state = "up"
		}
		if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
			errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
		}
	}

	// A non-zero exit usually just means "no default route to delete", so
	// only non-ExitError failures (e.g. `ip` binary missing) are reported.
	if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
		var exitErr *exec.ExitError
		if !errors.As(err, &exitErr) {
			errs = append(errs, fmt.Sprintf("clear default route: %v", err))
		}
	}
	for _, route := range snapshot.DefaultRoutes {
		fields := strings.Fields(route)
		if len(fields) == 0 {
			continue
		}
		// Strip state flags that ip-route(8) does not accept as add arguments.
		// In-place filter: filtered aliases fields' backing array, which is
		// safe because the write index never passes the read index.
		filtered := fields[:0]
		for _, f := range fields {
			switch f {
			case "linkdown", "dead", "onlink", "pervasive":
				// skip
			default:
				filtered = append(filtered, f)
			}
		}
		args := append([]string{"route", "add"}, filtered...)
		if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
			detail := strings.TrimSpace(string(raw))
			if detail != "" {
				errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
			} else {
				errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
			}
		}
	}

	if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
		errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
	}

	if len(errs) > 0 {
		return errors.New(strings.Join(errs, "; "))
	}
	return nil
}
|
||||
|
||||
func (s *System) DHCPOne(iface string) (string, error) {
|
||||
var out bytes.Buffer
|
||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
// SetInterfaceState brings a network interface up or down.
|
||||
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||
state := "down"
|
||||
if up {
|
||||
state = "up"
|
||||
}
|
||||
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||
}
|
||||
|
||||
// GetInterfaceState returns true if the interface is administratively UP —
// i.e. the UP flag is present in `ip link` output, regardless of carrier /
// operational state.
func (s *System) GetInterfaceState(iface string) (bool, error) {
	return interfaceAdminState(iface)
}
|
||||
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
|
||||
// parseInterfaceAdminState extracts the flag list between '<' and '>' in a
// line of `ip -o link show` output and reports whether it contains the
// administrative UP flag. Admin state is independent of operational state:
// an admin-up interface with no carrier still prints "state DOWN".
func parseInterfaceAdminState(raw string) (bool, error) {
	_, rest, found := strings.Cut(raw, "<")
	if !found {
		return false, fmt.Errorf("ip link output missing flags")
	}
	flagList, _, found := strings.Cut(rest, ">")
	if !found {
		return false, fmt.Errorf("ip link output missing flag terminator")
	}
	for _, flag := range strings.Split(flagList, ",") {
		if strings.TrimSpace(flag) == "UP" {
			return true, nil
		}
	}
	return false, nil
}
|
||||
|
||||
// interfaceIPv4Addrs lists the IPv4 CIDR addresses assigned to iface using
// `ip -o -4 addr show`. A non-zero exit status (e.g. unknown interface) is
// treated as "no addresses" rather than an error; only failures to run the
// command at all are reported.
func interfaceIPv4Addrs(iface string) ([]string, error) {
	out, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
	if err != nil {
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			return nil, nil
		}
		return nil, err
	}
	var addrs []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		// One-line-per-address output: the CIDR is the fourth column.
		cols := strings.Fields(line)
		if len(cols) < 4 {
			continue
		}
		addrs = append(addrs, cols[3])
	}
	return addrs, nil
}
|
||||
|
||||
func listInterfaceNames() ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||
if err != nil {
|
||||
|
||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestParseInterfaceAdminState checks the flag parser against admin-up with
// no carrier (UP flag but "state DOWN"), plain admin-down, and output with
// no flag block at all.
func TestParseInterfaceAdminState(t *testing.T) {
	tests := []struct {
		name    string
		raw     string
		want    bool
		wantErr bool
	}{
		{
			name: "admin up with no carrier",
			raw:  "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
			want: true,
		},
		{
			name: "admin down",
			raw:  "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
			want: false,
		},
		{
			name:    "malformed output",
			raw:     "2: enp1s0: mtu 1500 state DOWN\n",
			wantErr: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := parseInterfaceAdminState(tt.raw)
			if tt.wantErr {
				if err == nil {
					t.Fatal("expected error")
				}
				return
			}
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if got != tt.want {
				t.Fatalf("got %v want %v", got, tt.want)
			}
		})
	}
}
|
||||
203
audit/internal/platform/nvidia_stress.go
Normal file
203
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,203 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// RunNvidiaStressPack runs the NVIDIA GPU stress acceptance pack: a
// pre-run nvidia-smi state capture, the stress loader selected in opts
// (builtin gpu-burn, john, or nccl), and a post-run nvidia-smi summary.
// Logs are archived under baseDir with a loader-specific prefix; the
// archive path is returned.
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
	// Fill defaults and canonicalise the loader name before building the job.
	normalizeNvidiaStressOptions(&opts)

	job, err := buildNvidiaStressJob(opts)
	if err != nil {
		return "", err
	}

	return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
		{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
		{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
		job,
		{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
	}, logFunc)
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
// buildNvidiaStressJob constructs the satJob for the configured stress
// loader. The GPU selection (include minus exclude, resolved against the
// GPUs nvidia-smi reports) is passed to the loader via --devices and also
// recorded on the job so per-GPU telemetry is collected while it runs.
// opts is expected to have been normalised already, but "" is still
// accepted as the builtin loader for safety.
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return satJob{}, err
	}

	loader := strings.TrimSpace(strings.ToLower(opts.Loader))
	switch loader {
	case "", NvidiaStressLoaderBuiltin:
		// Builtin loader is the only one that takes a --size-mb knob.
		cmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(opts.DurationSec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
		}
		if len(selected) > 0 {
			cmd = append(cmd, "--devices", joinIndexList(selected))
		}
		return satJob{
			name:       "03-bee-gpu-burn.log",
			cmd:        cmd,
			collectGPU: true,
			gpuIndices: selected,
		}, nil
	case NvidiaStressLoaderJohn:
		cmd := []string{
			"bee-john-gpu-stress",
			"--seconds", strconv.Itoa(opts.DurationSec),
		}
		if len(selected) > 0 {
			cmd = append(cmd, "--devices", joinIndexList(selected))
		}
		return satJob{
			name:       "03-john-gpu-stress.log",
			cmd:        cmd,
			collectGPU: true,
			gpuIndices: selected,
		}, nil
	case NvidiaStressLoaderNCCL:
		cmd := []string{
			"bee-nccl-gpu-stress",
			"--seconds", strconv.Itoa(opts.DurationSec),
		}
		if len(selected) > 0 {
			cmd = append(cmd, "--devices", joinIndexList(selected))
		}
		return satJob{
			name:       "03-bee-nccl-gpu-stress.log",
			cmd:        cmd,
			collectGPU: true,
			gpuIndices: selected,
		}, nil
	default:
		return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
	}
}
|
||||
|
||||
// normalizeNvidiaStressOptions fills defaults in place: duration falls back
// to 300s, the loader name is canonicalised (unknown loaders map to the
// builtin), and GPU include/exclude lists are de-duplicated and sorted.
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
	if opts.DurationSec <= 0 {
		opts.DurationSec = 300
	}
	// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
	switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
	case "", NvidiaStressLoaderBuiltin:
		opts.Loader = NvidiaStressLoaderBuiltin
	case NvidiaStressLoaderJohn:
		opts.Loader = NvidiaStressLoaderJohn
	case NvidiaStressLoaderNCCL:
		opts.Loader = NvidiaStressLoaderNCCL
	default:
		// Unknown loader names silently fall back to the builtin.
		opts.Loader = NvidiaStressLoaderBuiltin
	}
	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
}
|
||||
|
||||
// resolveNvidiaGPUSelection determines which GPU indices to stress: it
// starts from all GPUs nvidia-smi reports, keeps only those in include
// (when include is non-empty), then removes any in exclude. The result is
// a fresh sorted slice. Errors when no GPUs are detected at all or the
// filters leave nothing selected.
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
	all, err := listNvidiaGPUIndices()
	if err != nil {
		return nil, err
	}
	if len(all) == 0 {
		return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
	}

	selected := all
	if len(include) > 0 {
		want := make(map[int]struct{}, len(include))
		for _, idx := range include {
			want[idx] = struct{}{}
		}
		// In-place filter over all's backing array; safe because the write
		// index never overtakes the read index.
		selected = selected[:0]
		for _, idx := range all {
			if _, ok := want[idx]; ok {
				selected = append(selected, idx)
			}
		}
	}
	if len(exclude) > 0 {
		skip := make(map[int]struct{}, len(exclude))
		for _, idx := range exclude {
			skip[idx] = struct{}{}
		}
		// Same in-place filter idiom as above.
		filtered := selected[:0]
		for _, idx := range selected {
			if _, ok := skip[idx]; ok {
				continue
			}
			filtered = append(filtered, idx)
		}
		selected = filtered
	}
	if len(selected) == 0 {
		return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
	}
	// Copy so callers never alias the filtered backing array.
	out := append([]int(nil), selected...)
	sort.Ints(out)
	return out, nil
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
// dedupeSortedIndices drops negative and duplicate values and returns the
// remainder sorted ascending. Empty input yields nil.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}
	unique := make(map[int]struct{}, len(values))
	result := make([]int, 0, len(values))
	for _, v := range values {
		if v < 0 {
			continue
		}
		if _, dup := unique[v]; dup {
			continue
		}
		unique[v] = struct{}{}
		result = append(result, v)
	}
	sort.Ints(result)
	return result
}
|
||||
|
||||
// joinIndexList renders values as a comma-separated list, e.g. "0,2,3".
// An empty slice yields "".
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||
545
audit/internal/platform/platform_stress.go
Normal file
545
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,545 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlatformStressCycle defines one load+idle cycle.
|
||||
type PlatformStressCycle struct {
|
||||
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||
IdleSec int // seconds of idle monitoring after load cut
|
||||
}
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
type PlatformStressOptions struct {
	Cycles     []PlatformStressCycle // executed in order; must be non-empty
	Components []string              // if empty: run all; values: "cpu", "gpu"
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
type platformStressRow struct {
	ElapsedSec   float64 // seconds since the test started
	Cycle        int     // 1-based cycle number
	Phase        string  // "load" | "idle"
	CPULoadPct   float64 // overall CPU utilisation (percent)
	MaxCPUTempC  float64 // hottest CPU-group sensor (°C)
	MaxGPUTempC  float64 // hottest GPU sensor or device (°C)
	SysPowerW    float64 // system power draw (W)
	FanMinRPM    float64 // slowest fan; 0 when no fan telemetry was read
	FanMaxRPM    float64 // fastest fan
	GPUThrottled bool    // any GPU throttle condition observed this second
}
|
||||
|
||||
// RunPlatformStress runs repeated load+idle thermal cycling.
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
//
// A per-second metrics CSV and a plain-text summary are written into a
// timestamped directory under baseDir, packed into a tar.gz archive
// (whose path is returned), and the directory is then removed.
// logFunc, if non-nil, receives progress lines as the test runs.
func (s *System) RunPlatformStress(
	ctx context.Context,
	baseDir string,
	opts PlatformStressOptions,
	logFunc func(string),
) (string, error) {
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if len(opts.Cycles) == 0 {
		return "", fmt.Errorf("no cycles defined")
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}

	stamp := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir run dir: %w", err)
	}

	// Empty Components means "stress everything".
	hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
	hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")

	vendor := s.DetectGPUVendor()
	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))

	var rows []platformStressRow
	start := time.Now()

	var analyses []cycleAnalysis

	for i, cycle := range opts.Cycles {
		// Context cancellation ends the run early, but the cycles that
		// already completed are still summarized and archived below.
		if ctx.Err() != nil {
			break
		}
		cycleNum := i + 1
		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))

		// ── LOAD PHASE ───────────────────────────────────────────────────────
		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
		var wg sync.WaitGroup

		// CPU stress
		if hasCPU {
			wg.Add(1)
			go func() {
				defer wg.Done()
				cpuCmd, err := buildCPUStressCmd(loadCtx)
				if err != nil {
					logFunc("CPU stress: " + err.Error())
					return
				}
				_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
			}()
		}

		// GPU stress
		if hasGPU {
			wg.Add(1)
			go func() {
				defer wg.Done()
				gpuCmd := buildGPUStressCmd(loadCtx, vendor)
				if gpuCmd == nil {
					// No GPU stress tool available; CPU-only cycling proceeds.
					return
				}
				_ = gpuCmd.Wait()
			}()
		}

		// Monitoring goroutine for load phase
		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
		for _, r := range loadRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, loadRows...)
		loadCancel()
		wg.Wait()

		if len(loadRows) > 0 {
			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
		}

		// ── IDLE PHASE ───────────────────────────────────────────────────────
		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
		for _, r := range idleRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, idleRows...)
		idleCancel()

		// Per-cycle analysis
		an := analyzePlatformCycle(loadRows, idleRows)
		analyses = append(analyses, an)
		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
	}

	// Write CSV (best effort: the archive still carries the summary/log)
	csvData := writePlatformCSV(rows)
	_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)

	// Write summary
	summary := writePlatformSummary(opts, analyses)
	logFunc("--- Summary ---")
	for _, line := range strings.Split(summary, "\n") {
		if line != "" {
			logFunc(line)
		}
	}
	_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)

	// Pack tar.gz
	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
	if err := packPlatformDir(runDir, archivePath); err != nil {
		return "", fmt.Errorf("pack archive: %w", err)
	}
	_ = os.RemoveAll(runDir)
	return archivePath, nil
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||
var rows []platformStressRow
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return rows
|
||||
case <-ticker.C:
|
||||
sample := SampleLiveMetrics()
|
||||
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||
r := platformStressRow{
|
||||
ElapsedSec: time.Since(testStart).Seconds(),
|
||||
Cycle: cycle,
|
||||
Phase: phase,
|
||||
CPULoadPct: s.CPULoadPct,
|
||||
SysPowerW: s.PowerW,
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
if t.Celsius > r.MaxCPUTempC {
|
||||
r.MaxCPUTempC = t.Celsius
|
||||
}
|
||||
case "gpu":
|
||||
if t.Celsius > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = t.Celsius
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
if g.TempC > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = g.TempC
|
||||
}
|
||||
}
|
||||
if len(s.Fans) > 0 {
|
||||
r.FanMinRPM = s.Fans[0].RPM
|
||||
r.FanMaxRPM = s.Fans[0].RPM
|
||||
for _, f := range s.Fans[1:] {
|
||||
if f.RPM < r.FanMinRPM {
|
||||
r.FanMinRPM = f.RPM
|
||||
}
|
||||
if f.RPM > r.FanMaxRPM {
|
||||
r.FanMaxRPM = f.RPM
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func formatPlatformRow(r platformStressRow) string {
|
||||
throttle := ""
|
||||
if r.GPUThrottled {
|
||||
throttle = " THROTTLE"
|
||||
}
|
||||
fans := ""
|
||||
if r.FanMinRPM > 0 {
|
||||
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||
}
|
||||
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||
}
|
||||
|
||||
// analyzePlatformCycle reduces one cycle's load and idle telemetry to
// peak temperatures/power, a throttle flag, and the fan spin-down
// behaviour in the 15 s after load cut.
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
	var an cycleAnalysis
	// Peaks are taken over the load phase only.
	for _, r := range loadRows {
		if r.MaxCPUTempC > an.maxCPUTemp {
			an.maxCPUTemp = r.MaxCPUTempC
		}
		if r.MaxGPUTempC > an.maxGPUTemp {
			an.maxGPUTemp = r.MaxGPUTempC
		}
		if r.SysPowerW > an.maxPower {
			an.maxPower = r.SysPowerW
		}
		if r.GPUThrottled {
			an.throttled = true
		}
	}
	// Fan RPM at cut = avg of last 5 load rows
	if n := len(loadRows); n > 0 {
		window := loadRows
		if n > 5 {
			window = loadRows[n-5:]
		}
		var sum float64
		var cnt int
		for _, r := range window {
			// Rows without fan telemetry (FanMinRPM == 0) are excluded
			// from the average.
			if r.FanMinRPM > 0 {
				sum += (r.FanMinRPM + r.FanMaxRPM) / 2
				cnt++
			}
		}
		if cnt > 0 {
			an.fanAtCutAvg = sum / float64(cnt)
		}
	}
	// Fan RPM min in first 15s of idle
	an.fanMin15s = an.fanAtCutAvg
	var cutElapsed float64
	if len(loadRows) > 0 {
		cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
	}
	for _, r := range idleRows {
		// Idle rows share the test-wide elapsed clock, so the 15 s
		// window is anchored at the last load sample.
		if r.ElapsedSec > cutElapsed+15 {
			break
		}
		avg := (r.FanMinRPM + r.FanMaxRPM) / 2
		if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
			an.fanMin15s = avg
		}
	}
	if an.fanAtCutAvg > 0 {
		// Percent drop from the at-cut average to the idle minimum; a
		// large value means the fans spun down quickly after load cut.
		an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
	}
	return an
}
|
||||
|
||||
// cycleAnalysis summarizes one load+idle cycle for the report.
type cycleAnalysis struct {
	maxCPUTemp  float64 // peak CPU temperature during load (°C)
	maxGPUTemp  float64 // peak GPU temperature during load (°C)
	maxPower    float64 // peak system power during load (W)
	throttled   bool    // any GPU throttle flag seen during load
	fanAtCutAvg float64 // average fan RPM over the last 5 load samples
	fanMin15s   float64 // minimum fan RPM within 15 s after load cut
	fanDropPct  float64 // percent drop from fanAtCutAvg to fanMin15s
}
|
||||
|
||||
// writePlatformSummary renders the per-cycle findings plus an overall
// verdict: FAIL on any throttle, WARN on fast fan spindown (>20% drop
// within 15 s of load cut), otherwise PASS.
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
	var b strings.Builder
	fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
	fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))

	totalThrottle := 0
	totalFanWarn := 0
	for i, an := range analyses {
		// analyses is built in cycle order, so index i maps onto Cycles[i].
		cycle := opts.Cycles[i]
		fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
		fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
		fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
		fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
		if an.throttled {
			fmt.Fprintf(&b, " Throttle: DETECTED\n")
			totalThrottle++
		} else {
			fmt.Fprintf(&b, " Throttle: none\n")
		}
		// Fan lines are omitted entirely when no fan telemetry was read.
		if an.fanAtCutAvg > 0 {
			fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
			fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
			if an.fanDropPct > 20 {
				fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
				totalFanWarn++
			} else {
				fmt.Fprintf(&b, " Fan response: OK\n")
			}
		}
		b.WriteString("\n")
	}

	fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
	if totalThrottle > 0 {
		fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
	} else if totalFanWarn > 0 {
		fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
	} else {
		fmt.Fprintf(&b, "Overall: PASS\n")
	}
	return b.String()
}
|
||||
|
||||
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||
var buf bytes.Buffer
|
||||
w := csv.NewWriter(&buf)
|
||||
_ = w.Write([]string{
|
||||
"elapsed_sec", "cycle", "phase",
|
||||
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||
})
|
||||
for _, r := range rows {
|
||||
throttled := "0"
|
||||
if r.GPUThrottled {
|
||||
throttled = "1"
|
||||
}
|
||||
_ = w.Write([]string{
|
||||
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||
strconv.Itoa(r.Cycle),
|
||||
r.Phase,
|
||||
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||
throttled,
|
||||
})
|
||||
}
|
||||
w.Flush()
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
path, err := satLookPath("stressapptest")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// buildAMDGPUStressCmd starts an RVS GST stress run across all AMD
// devices. The effectively unbounded duration (24 h in ms) is cut
// short by ctx. Returns nil when RVS cannot be resolved.
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
	rvsArgs, err := resolveRVSCommand()
	if err != nil {
		return nil
	}
	rvsPath := rvsArgs[0]
	// NOTE(review): YAML indentation reconstructed from a garbled
	// source — confirm it matches the RVS gst action schema.
	cfg := `actions:
- name: gst_platform
  device: all
  module: gst
  parallel: true
  duration: 86400000
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
  matrix_size_b: 8640
  matrix_size_c: 8640
`
	cfgFile := "/tmp/bee-platform-gst.conf"
	// Best effort: rvs itself reports a missing/unreadable config.
	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
	cmd.Stdout = nil
	cmd.Stderr = nil
	_ = startLowPriorityCmd(cmd, 10)
	return cmd
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
// startLowPriorityCmd starts cmd and then lowers its scheduling
// priority to the given nice value.
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	if p := cmd.Process; p != nil {
		// Best effort: a failed renice must not abort the stress run.
		_ = syscall.Setpriority(syscall.PRIO_PROCESS, p.Pid, nice)
	}
	return nil
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
// containsComponent reports whether name appears in components
// (exact, case-sensitive match).
func containsComponent(components []string, name string) bool {
	for i := range components {
		if components[i] == name {
			return true
		}
	}
	return false
}
|
||||
|
||||
// packPlatformDir writes every regular file directly inside dir into a
// gzip-compressed tar archive at dest. Entries are stored under the
// directory's base name; subdirectories are skipped and unreadable
// files are silently omitted (best effort).
//
// Fix over the original: the tar/gzip/file Close errors were discarded
// by defers, so a short write or failed flush produced a silently
// truncated archive reported as success. Closes are now explicit and
// their errors propagated.
func packPlatformDir(dir, dest string) error {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	// Backstop for early error returns; the success path closes
	// explicitly below (double Close on *os.File is harmless).
	defer f.Close()

	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)

	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		data, err := os.ReadFile(filepath.Join(dir, e.Name()))
		if err != nil {
			// Best effort: a vanished or unreadable file is skipped.
			continue
		}
		hdr := &tar.Header{
			Name:    filepath.Join(base, e.Name()),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(data); err != nil {
			return err
		}
	}

	// Close in order (tar, then gzip, then file) so buffered trailers
	// are flushed and any failure surfaces to the caller.
	if err := tw.Close(); err != nil {
		return err
	}
	if err := gz.Close(); err != nil {
		return err
	}
	return f.Close()
}
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
@@ -16,9 +16,6 @@ var runtimeRequiredTools = []string{
|
||||
"smartctl",
|
||||
"nvme",
|
||||
"ipmitool",
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"dhclient",
|
||||
"mount",
|
||||
}
|
||||
@@ -93,7 +90,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
}
|
||||
|
||||
for _, tool := range s.CheckTools(runtimeRequiredTools) {
|
||||
vendor := s.DetectGPUVendor()
|
||||
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||
Name: tool.Name,
|
||||
Path: tool.Path,
|
||||
@@ -115,39 +113,7 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
})
|
||||
}
|
||||
|
||||
lsmodText := commandText("lsmod")
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
health.CUDAReady = false
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -162,3 +128,90 @@ func commandText(name string, args ...string) string {
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools := s.CheckTools(runtimeRequiredTools)
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
tool.Path = cmd[0]
|
||||
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||
tool.Path = cmd[1]
|
||||
}
|
||||
tool.OK = true
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
return tools
|
||||
}
|
||||
|
||||
// collectGPURuntimeHealth fills health.DriverReady / health.CUDAReady
// and appends driver/runtime issues for the detected GPU vendor.
// Vendors other than "nvidia"/"amd" leave health untouched.
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
	lsmodText := commandText("lsmod")

	switch vendor {
	case "nvidia":
		// The trailing space distinguishes the core "nvidia" module from
		// nvidia_modeset / nvidia_uvm in lsmod output.
		health.DriverReady = strings.Contains(lsmodText, "nvidia ")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_kernel_module_missing",
				Severity:    "warning",
				Description: "NVIDIA kernel module is not loaded.",
			})
		}
		if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "nvidia_modeset_failed",
				Severity:    "warning",
				Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
			})
		}
		// A responsive nvidia-smi listing overrides the lsmod heuristic.
		if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
			health.DriverReady = true
		}

		if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
			// A 1-second, 1 MB burn serves as a cheap CUDA smoke test.
			out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
			if err == nil {
				health.CUDAReady = true
			} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
				health.Issues = append(health.Issues, schema.RuntimeIssue{
					Code:        "cuda_runtime_not_ready",
					Severity:    "warning",
					Description: "CUDA runtime is not ready for GPU SAT.",
				})
			}
		}
	case "amd":
		health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
		if !health.DriverReady {
			health.Issues = append(health.Issues, schema.RuntimeIssue{
				Code:        "amdgpu_kernel_module_missing",
				Severity:    "warning",
				Description: "AMD GPU driver is not loaded.",
			})
		}

		out, err := runROCmSMI("--showproductname", "--csv")
		if err == nil && strings.TrimSpace(string(out)) != "" {
			// A responsive rocm-smi implies both driver and runtime work.
			health.CUDAReady = true
			health.DriverReady = true
			return
		}

		health.Issues = append(health.Issues, schema.RuntimeIssue{
			Code:        "rocm_smi_unavailable",
			Severity:    "warning",
			Description: "ROCm SMI is not available for AMD GPU SAT.",
		})
	}
}
|
||||
|
||||
@@ -2,19 +2,78 @@ package platform
|
||||
|
||||
import (
	"archive/tar"
	"bufio"
	"bytes"
	"compress/gzip"
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"
)
|
||||
|
||||
var (
	// Indirections over exec/os/filepath — presumably seams for test
	// stubbing (satExecCommand is swapped in tests); confirm usage.
	satExecCommand = exec.Command
	satLookPath    = exec.LookPath
	satGlob        = filepath.Glob
	satStat        = os.Stat

	// Known ROCm install locations, covering both the unversioned
	// /opt/rocm symlink tree and versioned /opt/rocm-* trees.
	rocmSMIExecutableGlobs = []string{
		"/opt/rocm/bin/rocm-smi",
		"/opt/rocm-*/bin/rocm-smi",
	}
	rocmSMIScriptGlobs = []string{
		"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
		"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
	}
	rvsExecutableGlobs = []string{
		"/opt/rocm/bin/rvs",
		"/opt/rocm-*/bin/rvs",
	}
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
// Returns combined stdout+stderr as a byte slice.
//
// Fix over the original: bufio.Scanner's default 64 KiB token limit
// meant an over-long line stopped the scan loop while the child kept
// writing to the pipe — the writer would block and cmd.Wait could hang.
// The scanner buffer is enlarged and, if scanning still fails, the pipe
// is drained so the child can always finish.
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
	pr, pw := io.Pipe()
	cmd.Stdout = pw
	cmd.Stderr = pw

	var buf bytes.Buffer
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		scanner := bufio.NewScanner(pr)
		// Allow lines up to 1 MiB instead of bufio's 64 KiB default.
		scanner.Buffer(make([]byte, 64*1024), 1024*1024)
		for scanner.Scan() {
			line := scanner.Text()
			buf.WriteString(line + "\n")
			if logFunc != nil {
				logFunc(line)
			}
		}
		// On a scanner error keep consuming so the writer side never
		// blocks; the discarded remainder is lost from the capture but
		// the process can exit and Wait returns.
		if scanner.Err() != nil {
			_, _ = io.Copy(io.Discard, pr)
		}
	}()

	if err := cmd.Start(); err != nil {
		_ = pw.Close()
		wg.Wait()
		return nil, err
	}
	waitErr := cmd.Wait()
	_ = pw.Close()
	wg.Wait()
	return buf.Bytes(), waitErr
}
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
@@ -36,12 +95,18 @@ func (s *System) DetectGPUVendor() string {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
out, err := exec.Command("rocm-smi", "--showproductname", "--csv").Output()
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||
}
|
||||
@@ -63,13 +128,103 @@ func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
}
|
||||
|
||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||
func (s *System) RunAMDAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-amd", []satJob{
|
||||
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
//
// The RVS action config is written to a fixed /tmp path and SMI
// snapshots are captured before and after the 60 s MEM run.
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if err := ensureAMDRuntimeReady(); err != nil {
		return "", err
	}
	cfgFile := "/tmp/bee-amd-mem.conf"
	// NOTE(review): YAML indentation reconstructed from a garbled
	// source — confirm it matches the RVS mem action schema.
	cfg := `actions:
- name: mem_integrity
  device: all
  module: mem
  parallel: true
  duration: 60000
  copy_matrix: false
  target_stress: 90
  matrix_size: 8640
`
	// Best effort write; rvs fails loudly if the config is absent.
	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
		{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
		{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
	}, logFunc)
}
|
||||
|
||||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
//
// Combines rocm-bandwidth-test with an RVS BABEL (STREAM-style) run;
// the action config is written to a fixed /tmp path.
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if err := ensureAMDRuntimeReady(); err != nil {
		return "", err
	}
	cfgFile := "/tmp/bee-amd-babel.conf"
	// NOTE(review): YAML indentation reconstructed from a garbled
	// source — confirm it matches the RVS babel action schema.
	cfg := `actions:
- name: babel_mem_bw
  device: all
  module: babel
  parallel: true
  copy_matrix: true
  target_stress: 90
  matrix_size: 134217728
`
	// Best effort write; rvs fails loudly if the config is absent.
	_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
		{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
	}, logFunc)
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||
}
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||
rvsCfg := amdStressRVSConfig(seconds)
|
||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||
}
|
||||
|
||||
// amdStressRVSConfig renders the RVS gst action config for a stress
// run of the given length (seconds are converted to the RVS duration
// unit, milliseconds).
func amdStressRVSConfig(seconds int) string {
	// NOTE(review): YAML indentation reconstructed from a garbled
	// source — confirm it matches the RVS gst action schema.
	return fmt.Sprintf(`actions:
- name: gst_stress
  device: all
  module: gst
  parallel: true
  duration: %d
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
  matrix_size_b: 8640
  matrix_size_c: 8640
`, seconds*1000)
}
|
||||
|
||||
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||
return []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||
}
|
||||
}
|
||||
|
||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||
@@ -104,39 +259,116 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA SAT with explicit duration,
|
||||
// GPU memory size, and GPU index selection. ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, durationSec int, sizeMB int, gpuIndices []int) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaSATJobsWithOptions(durationSec, sizeMB, gpuIndices))
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
if len(gpuIndices) > 0 {
|
||||
return dedupeSortedIndices(gpuIndices), nil
|
||||
}
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunCPUAcceptancePack(baseDir string, durationSec int) (string, error) {
|
||||
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
}
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
"stress-ng", "--vm", "1",
|
||||
"--vm-bytes", sizeArg,
|
||||
"--vm-method", "all",
|
||||
"--timeout", fmt.Sprintf("%d", seconds),
|
||||
"--metrics-brief",
|
||||
}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||
}
|
||||
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stressapptest.log", cmd: cmd},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if durationSec <= 0 {
|
||||
durationSec = 60
|
||||
}
|
||||
return runAcceptancePack(baseDir, "cpu", []satJob{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -164,11 +396,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
}
|
||||
|
||||
for index, devPath := range devices {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := runSATCommand(verboseLog, job.name, job.cmd)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -206,83 +444,39 @@ type satStats struct {
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
if diagLevel < 1 || diagLevel > 4 {
|
||||
diagLevel = 3
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, prefix+"-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
}
|
||||
out, err := runSATCommand(verboseLog, job.name, cmd)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func nvidiaSATJobsWithOptions(durationSec, sizeMB int, gpuIndices []int) []satJob {
|
||||
var env []string
|
||||
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{
|
||||
name: "05-bee-gpu-stress.log",
|
||||
cmd: []string{"bee-gpu-stress", "--seconds", strconv.Itoa(durationSec), "--size-mb", strconv.Itoa(sizeMB)},
|
||||
env: env,
|
||||
collectGPU: true,
|
||||
gpuIndices: gpuIndices,
|
||||
},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -309,9 +503,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir)
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env)
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
@@ -335,18 +529,38 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string) ([]byte, error) {
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, cmd[0], cmd[1:]...)
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
c.Cancel = func() error {
|
||||
if c.Process != nil {
|
||||
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
out, err := c.CombinedOutput()
|
||||
out, err := streamExecOutput(c, logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -362,19 +576,11 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) != 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices, nil
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
@@ -429,6 +635,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
|
||||
text := strings.ToLower(string(out))
|
||||
// No output at all means the tool failed to start (mlock limit, binary missing,
|
||||
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
|
||||
if len(strings.TrimSpace(text)) == 0 {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
@@ -437,20 +648,36 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
strings.Contains(text, "not available") ||
|
||||
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||
strings.Contains(text, "no such device") ||
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
return "FAILED", rc
|
||||
}
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(cmd, " "),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
@@ -465,9 +692,125 @@ func runSATCommand(verboseLog, name string, cmd []string) ([]byte, error) {
|
||||
return out, err
|
||||
}
|
||||
|
||||
func runROCmSMI(args ...string) ([]byte, error) {
|
||||
cmd, err := resolveROCmSMICommand(args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
}
|
||||
|
||||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
if len(cmd) == 0 {
|
||||
return nil, errors.New("empty SAT command")
|
||||
}
|
||||
switch cmd[0] {
|
||||
case "rocm-smi":
|
||||
return resolveROCmSMICommand(cmd[1:]...)
|
||||
case "rvs":
|
||||
return resolveRVSCommand(cmd[1:]...)
|
||||
}
|
||||
path, err := satLookPath(cmd[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||
}
|
||||
return append([]string{path}, cmd[1:]...), nil
|
||||
}
|
||||
|
||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rvs"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
for _, path := range rocmSMIExecutableCandidates() {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
pythonPath, pyErr := satLookPath("python3")
|
||||
if pyErr == nil {
|
||||
for _, script := range rocmSMIScriptCandidates() {
|
||||
cmd := []string{pythonPath, script}
|
||||
cmd = append(cmd, args...)
|
||||
return cmd, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func ensureAMDRuntimeReady() error {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return nil
|
||||
}
|
||||
if raw, err := os.ReadFile("/sys/module/amdgpu/initstate"); err == nil {
|
||||
state := strings.TrimSpace(string(raw))
|
||||
if strings.EqualFold(state, "live") {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
|
||||
}
|
||||
return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
|
||||
}
|
||||
|
||||
func rocmSMIExecutableCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||
}
|
||||
|
||||
func rocmSMIScriptCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||
}
|
||||
|
||||
func expandExistingPaths(patterns []string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var paths []string
|
||||
for _, pattern := range patterns {
|
||||
matches, err := satGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
if _, err := satStat(match); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[match]; ok {
|
||||
continue
|
||||
}
|
||||
seen[match] = struct{}{}
|
||||
paths = append(paths, match)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
func parseStorageDevices(raw string) []string {
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) < 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string) ([]byte, error) {
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
var metricRows []GPUMetricRow
|
||||
@@ -495,7 +838,7 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
||||
}
|
||||
}()
|
||||
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||||
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
720
audit/internal/platform/sat_fan_stress.go
Normal file
720
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,720 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FanStressOptions configures the fan-stress / thermal cycling test.
|
||||
type FanStressOptions struct {
|
||||
BaselineSec int // idle monitoring before and after load (default 30)
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
// FanReading holds one fan sensor reading.
|
||||
type FanReading struct {
|
||||
Name string
|
||||
RPM float64
|
||||
}
|
||||
|
||||
// GPUStressMetric holds per-GPU metrics during the stress test.
|
||||
type GPUStressMetric struct {
|
||||
Index int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
Throttled bool // true if any throttle reason is active
|
||||
}
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
type cachedPowerReading struct {
|
||||
Value float64
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
applyFanStressDefaults(&opts)
|
||||
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
// Phase name shared between sampler goroutine and main goroutine.
|
||||
var phaseMu sync.Mutex
|
||||
currentPhase := "init"
|
||||
setPhase := func(name string) {
|
||||
phaseMu.Lock()
|
||||
currentPhase = name
|
||||
phaseMu.Unlock()
|
||||
}
|
||||
getPhase := func() string {
|
||||
phaseMu.Lock()
|
||||
defer phaseMu.Unlock()
|
||||
return currentPhase
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
var rowsMu sync.Mutex
|
||||
var allRows []FanStressRow
|
||||
|
||||
// Start background sampler (every second).
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||
rowsMu.Lock()
|
||||
allRows = append(allRows, row)
|
||||
rowsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var summary strings.Builder
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
|
||||
stats := satStats{}
|
||||
|
||||
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durSec) * time.Second):
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||
)
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
stats.Failed++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
}
|
||||
|
||||
// Execute test phases.
|
||||
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||
|
||||
// Stop sampler and collect rows.
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
rowsMu.Lock()
|
||||
rows := allRows
|
||||
rowsMu.Unlock()
|
||||
|
||||
// Analysis.
|
||||
throttled := analyzeThrottling(rows)
|
||||
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
var m float64
|
||||
for _, g := range r.GPUs {
|
||||
if g.TempC > m {
|
||||
m = g.TempC
|
||||
}
|
||||
}
|
||||
return m
|
||||
})
|
||||
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
return r.CPUMaxTempC
|
||||
})
|
||||
fanResponseSec := analyzeFanResponse(rows)
|
||||
|
||||
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||
if fanResponseSec >= 0 {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||
}
|
||||
|
||||
// Throttling failure counts against overall result.
|
||||
if throttled {
|
||||
stats.Failed++
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
|
||||
// Write CSV outputs.
|
||||
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.BaselineSec <= 0 {
|
||||
opts.BaselineSec = 30
|
||||
}
|
||||
if opts.Phase1DurSec <= 0 {
|
||||
opts.Phase1DurSec = 300
|
||||
}
|
||||
if opts.PauseSec <= 0 {
|
||||
opts.PauseSec = 60
|
||||
}
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||
row := FanStressRow{
|
||||
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||
ElapsedSec: elapsed,
|
||||
Phase: phase,
|
||||
}
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
return row
|
||||
}
|
||||
|
||||
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||
// clock frequency, and active throttle reasons for each GPU.
|
||||
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var metrics []GPUStressMetric
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
throttleVal := strings.TrimSpace(parts[5])
|
||||
// Throttled if active reasons bitmask is non-zero.
|
||||
throttled := throttleVal != "0x0000000000000000" &&
|
||||
throttleVal != "0x0" &&
|
||||
throttleVal != "0" &&
|
||||
throttleVal != "" &&
|
||||
throttleVal != "N/A"
|
||||
metrics = append(metrics, GPUStressMetric{
|
||||
Index: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
Throttled: throttled,
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||
func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 2 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||
rpmVal := 0.0
|
||||
found := false
|
||||
for _, p := range parts[1:] {
|
||||
p = strings.TrimSpace(p)
|
||||
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||
continue
|
||||
}
|
||||
if strings.EqualFold(p, "RPM") {
|
||||
continue // unit-only column in old format; value is in previous field
|
||||
}
|
||||
val, err := parseFanRPMValue(p)
|
||||
if err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||
}
|
||||
return fans
|
||||
}
|
||||
|
||||
// parseFanRPMValue extracts the leading numeric value from a fan reading such
// as "4340 RPM" or "2,400.000", stripping thousands separators first.
// Returns strconv.ErrSyntax when the input contains no fields at all.
func parseFanRPMValue(raw string) (float64, error) {
	cleaned := strings.ReplaceAll(raw, ",", "")
	tokens := strings.Fields(cleaned)
	if len(tokens) == 0 {
		return 0, strconv.ErrSyntax
	}
	return strconv.ParseFloat(tokens[0], 64)
}
|
||||
|
||||
// sampleFanSpeedsViaSensorsJSON reads fan RPMs from "sensors -j" (lm-sensors
// JSON output). Chip and feature names are iterated in sorted order so the
// returned slice is deterministic across runs; duplicate labels keep only the
// first occurrence. Returns nil (with the exec error, if any) when the tool
// is unavailable or its output cannot be decoded.
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
	out, err := exec.Command("sensors", "-j").Output()
	if err != nil || len(out) == 0 {
		return nil, err
	}
	// Top level maps chip name -> feature name -> feature payload.
	var doc map[string]map[string]any
	if err := json.Unmarshal(out, &doc); err != nil {
		return nil, err
	}
	// Sort chip names for deterministic output ordering.
	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)
	var fans []FanReading
	seen := map[string]struct{}{}
	for _, chip := range chips {
		features := doc[chip]
		// Sort feature names within each chip, again for determinism.
		names := make([]string, 0, len(features))
		for name := range features {
			names = append(names, name)
		}
		sort.Strings(names)
		for _, name := range names {
			feature, ok := features[name].(map[string]any)
			if !ok {
				// Scalar entries (e.g. the "Adapter" string) are not features.
				continue
			}
			rpm, ok := firstFanInputValue(feature)
			if !ok || rpm <= 0 {
				// Not a fan feature, or the fan is stopped/unreadable.
				continue
			}
			label := strings.TrimSpace(name)
			// Prefix with the chip name unless the label already mentions it.
			if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
				label = chip + " / " + label
			}
			if _, ok := seen[label]; ok {
				// Same label reported twice; keep the first reading only.
				continue
			}
			seen[label] = struct{}{}
			fans = append(fans, FanReading{Name: label, RPM: rpm})
		}
	}
	return fans, nil
}
|
||||
|
||||
// firstFanInputValue returns the first (in sorted key order) "fan*_input"
// reading inside a sensors feature map. Values may arrive as float64 (normal
// JSON numbers) or as numeric strings; anything else is skipped.
// The second return value reports whether a usable reading was found.
func firstFanInputValue(feature map[string]any) (float64, bool) {
	sortedKeys := make([]string, 0, len(feature))
	for k := range feature {
		sortedKeys = append(sortedKeys, k)
	}
	// Sorted iteration keeps the choice deterministic when several keys match.
	sort.Strings(sortedKeys)
	for _, k := range sortedKeys {
		lowered := strings.ToLower(k)
		if !strings.Contains(lowered, "fan") || !strings.HasSuffix(lowered, "_input") {
			continue
		}
		switch v := feature[k].(type) {
		case float64:
			return v, true
		case string:
			if parsed, err := strconv.ParseFloat(v, 64); err == nil {
				return parsed, true
			}
		}
	}
	return 0, false
}
|
||||
|
||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||
func sampleCPUMaxTemp() float64 {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil {
|
||||
return sampleCPUTempViaSensors()
|
||||
}
|
||||
return parseIPMIMaxTemp(string(out))
|
||||
}
|
||||
|
||||
// parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type
// Temperature" output. Only rows whose unit column mentions "degrees" are
// considered; "na", empty, and unparsable values are skipped. Returns 0 when
// no usable row exists.
func parseIPMIMaxTemp(raw string) float64 {
	highest := 0.0
	for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
		cols := strings.Split(line, "|")
		if len(cols) < 3 {
			continue
		}
		// Unit column filters out non-temperature sensors (fans, voltages).
		if !strings.Contains(strings.ToLower(strings.TrimSpace(cols[2])), "degrees") {
			continue
		}
		field := strings.TrimSpace(cols[1])
		if field == "" || strings.EqualFold(field, "na") {
			continue
		}
		if v, err := strconv.ParseFloat(field, 64); err == nil && v > highest {
			highest = v
		}
	}
	return highest
}
|
||||
|
||||
// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable.
// It scans "sensors -u" output for "*_input:" lines and returns the highest
// plausible temperature (0 < t < 150); 0 when nothing usable is found.
func sampleCPUTempViaSensors() float64 {
	raw, err := exec.Command("sensors", "-u").Output()
	if err != nil {
		return 0
	}
	highest := 0.0
	for _, line := range strings.Split(string(raw), "\n") {
		tokens := strings.Fields(strings.TrimSpace(line))
		if len(tokens) < 2 || !strings.HasSuffix(tokens[0], "_input:") {
			continue
		}
		v, parseErr := strconv.ParseFloat(tokens[1], 64)
		if parseErr != nil {
			continue
		}
		// Reject zero/negative readings and implausibly hot values — the
		// "_input" suffix also matches non-temperature channels.
		if v > 0 && v < 150 && v > highest {
			highest = v
		}
	}
	return highest
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from
// "ipmitool dcmi power reading" output.
// Sample: "    Instantaneous power reading:   500 Watts"
// Returns 0 when no parsable "<value> Watts" pair follows an
// "Instantaneous" line.
func parseDCMIPowerReading(raw string) float64 {
	for _, line := range strings.Split(raw, "\n") {
		if !strings.Contains(strings.ToLower(line), "instantaneous") {
			continue
		}
		tokens := strings.Fields(line)
		// Find "Watts" and take the token immediately before it as the value.
		for i := 1; i < len(tokens); i++ {
			if !strings.EqualFold(tokens[i], "Watts") {
				continue
			}
			if v, err := strconv.ParseFloat(tokens[i-1], 64); err == nil {
				return v
			}
		}
	}
	return 0
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
return cache.Value, cache
|
||||
}
|
||||
return 0, cache
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" && row.Phase != "load2" {
|
||||
continue
|
||||
}
|
||||
for _, gpu := range row.GPUs {
|
||||
if gpu.Throttled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||
var max float64
|
||||
for _, row := range rows {
|
||||
if v := extract(row); v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||
// Compute baseline average fan RPM.
|
||||
var baseTotal, baseCount float64
|
||||
for _, row := range rows {
|
||||
if row.Phase != "baseline" {
|
||||
continue
|
||||
}
|
||||
for _, f := range row.Fans {
|
||||
baseTotal += f.RPM
|
||||
baseCount++
|
||||
}
|
||||
}
|
||||
if baseCount == 0 || baseTotal == 0 {
|
||||
return -1
|
||||
}
|
||||
baseAvg := baseTotal / baseCount
|
||||
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||
|
||||
// Find elapsed time when load1 started.
|
||||
var load1Start float64 = -1
|
||||
for _, row := range rows {
|
||||
if row.Phase == "load1" {
|
||||
load1Start = row.ElapsedSec
|
||||
break
|
||||
}
|
||||
}
|
||||
if load1Start < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Find first load1 row where average RPM crosses the threshold.
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" {
|
||||
continue
|
||||
}
|
||||
var total, count float64
|
||||
for _, f := range row.Fans {
|
||||
total += f.RPM
|
||||
count++
|
||||
}
|
||||
if count > 0 && total/count >= threshold {
|
||||
return row.ElapsedSec - load1Start
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
// GPU columns are generated per index in gpuIndices order. A GPU index missing
// from a row's samples is emitted with zero values. The file is written with
// mode 0644; with no rows at all, a "no data" placeholder file is written.
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
	if len(rows) == 0 {
		return os.WriteFile(path, []byte("no data\n"), 0644)
	}

	var b strings.Builder

	// Header: fixed system columns + per-GPU columns.
	b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
	for _, idx := range gpuIndices {
		fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
			idx, idx, idx, idx, idx)
	}
	b.WriteRune('\n')

	for _, row := range rows {
		// Aggregate this sample's fan readings into avg/min/max columns.
		favg, fmin, fmax := fanRPMStats(row.Fans)
		fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
			row.TimestampUTC,
			row.ElapsedSec,
			row.Phase,
			favg, fmin, fmax,
			row.CPUMaxTempC,
			row.SysPowerW,
		)
		// Index the row's GPU samples so columns can be emitted in
		// gpuIndices order regardless of sample order.
		gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
		for _, g := range row.GPUs {
			gpuByIdx[g.Index] = g
		}
		for _, idx := range gpuIndices {
			// Zero-value metric when the GPU is absent from this sample.
			g := gpuByIdx[idx]
			// Encode the throttle flag as 0/1 for CSV consumers.
			throttled := 0
			if g.Throttled {
				throttled = 1
			}
			fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
				g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
		}
		b.WriteRune('\n')
	}

	return os.WriteFile(path, []byte(b.String()), 0644)
}
|
||||
|
||||
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||
var b strings.Builder
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||
for _, row := range rows {
|
||||
for _, f := range row.Fans {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||
}
|
||||
}
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||
if len(fans) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
min = fans[0].RPM
|
||||
max = fans[0].RPM
|
||||
var total float64
|
||||
for _, f := range fans {
|
||||
total += f.RPM
|
||||
if f.RPM < min {
|
||||
min = f.RPM
|
||||
}
|
||||
if f.RPM > max {
|
||||
max = f.RPM
|
||||
}
|
||||
}
|
||||
return total / float64(len(fans)), min, max
|
||||
}
|
||||
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
67
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestParseFanSpeeds covers both ipmitool sdr output formats plus an
// unreadable row: FAN1 uses the old separate value|unit columns, FAN2 embeds
// the unit in the value column, and FAN3 ("na") must be skipped entirely.
func TestParseFanSpeeds(t *testing.T) {
	raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
	got := parseFanSpeeds(raw)
	if len(got) != 2 {
		t.Fatalf("fans=%d want 2 (%v)", len(got), got)
	}
	if got[0].Name != "FAN1" || got[0].RPM != 2400 {
		t.Fatalf("fan0=%+v", got[0])
	}
	if got[1].Name != "FAN2" || got[1].RPM != 1800 {
		t.Fatalf("fan1=%+v", got[1])
	}
}
|
||||
|
||||
// TestFirstFanInputValue checks the happy path: a "fan*_input" key holding a
// JSON number (float64) is returned directly.
func TestFirstFanInputValue(t *testing.T) {
	feature := map[string]any{
		"fan1_input": 9200.0,
	}
	got, ok := firstFanInputValue(feature)
	if !ok || got != 9200 {
		t.Fatalf("got=%v ok=%v", got, ok)
	}
}
|
||||
|
||||
// TestParseDCMIPowerReading verifies that only the "Instantaneous" line is
// parsed — the minimum-sampling line (498 Watts) must not be picked up.
func TestParseDCMIPowerReading(t *testing.T) {
	raw := `
    Instantaneous power reading:   512 Watts
    Minimum during sampling period: 498 Watts
`
	if got := parseDCMIPowerReading(raw); got != 512 {
		t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
	}
}
|
||||
|
||||
// TestEffectiveSystemPowerReading covers the three cache states: a recent
// cached value is held when the current reading is zero, a fresh non-zero
// reading replaces the cache, and an expired cache yields zero.
func TestEffectiveSystemPowerReading(t *testing.T) {
	now := time.Now()
	cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}

	// Zero current reading within the hold TTL: cached value is reported.
	got, updated := effectiveSystemPowerReading(cache, 0, now)
	if got != 480 {
		t.Fatalf("got=%v want cached 480", got)
	}
	if updated.Value != 480 {
		t.Fatalf("updated=%+v", updated)
	}

	// Fresh non-zero reading: reported and written back to the cache.
	got, updated = effectiveSystemPowerReading(cache, 530, now)
	if got != 530 {
		t.Fatalf("got=%v want 530", got)
	}
	if updated.Value != 530 {
		t.Fatalf("updated=%+v", updated)
	}

	// Cache older than systemPowerHoldTTL: no hold-over, zero is reported.
	expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
	got, _ = effectiveSystemPowerReading(expired, 0, now)
	if got != 0 {
		t.Fatalf("expired cache returned %v want 0", got)
	}
}
|
||||
@@ -3,6 +3,9 @@ package platform
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -28,21 +31,59 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||
if len(jobs) != 4 {
|
||||
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||
}
|
||||
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||
}
|
||||
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||
}
|
||||
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
@@ -53,6 +94,126 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 600,
|
||||
Loader: NvidiaStressLoaderJohn,
|
||||
ExcludeGPUIndices: []int{1},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 120,
|
||||
Loader: NvidiaStressLoaderNCCL,
|
||||
GPUIndices: []int{2, 0},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
got, err := resolveDCGMGPUIndices(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "0,1,2"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "1,3"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -78,8 +239,8 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -91,3 +252,128 @@ func TestClassifySATResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||
got := parseStorageDevices(raw)
|
||||
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
t.Setenv("PATH", t.TempDir())
|
||||
|
||||
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != toolPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
oldGlob := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
rocmSMIExecutableGlobs = []string{execPath}
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
rocmSMIExecutableGlobs = oldGlob
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
t.Setenv("PATH", "")
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != execPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
oldExecGlobs := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||
rocmSMIExecutableGlobs = nil
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
satLookPath = oldLookPath
|
||||
rocmSMIExecutableGlobs = oldExecGlobs
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||
t.Fatal("expected missing rocm-smi error")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) {
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
|
||||
@@ -24,15 +24,23 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
var techDumpNvidiaCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
Blockdevices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Tran string `json:"tran"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
|
||||
@@ -50,6 +58,15 @@ func (s *System) CaptureTechnicalDump(baseDir string) error {
|
||||
for _, cmd := range techDumpFixedCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
switch s.DetectGPUVendor() {
|
||||
case "nvidia":
|
||||
for _, cmd := range techDumpNvidiaCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
case "amd":
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||
}
|
||||
|
||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||
@@ -69,6 +86,14 @@ func writeCommandDump(path, name string, args ...string) {
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func writeROCmSMIDump(path string, args ...string) {
|
||||
out, err := runROCmSMI(args...)
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func lsblkDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -80,6 +105,9 @@ func lsblkDumpDevices(path string) []string {
|
||||
}
|
||||
var devices []string
|
||||
for _, dev := range root.Blockdevices {
|
||||
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||
continue
|
||||
}
|
||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||
}
|
||||
|
||||
@@ -12,12 +12,12 @@ func TestLSBLKDumpDevices(t *testing.T) {
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "lsblk.json")
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk"}]}`), 0644); err != nil {
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write lsblk fixture: %v", err)
|
||||
}
|
||||
|
||||
got := lsblkDumpDevices(path)
|
||||
want := []string{"nvme0n1", "sda"}
|
||||
want := []string{"nvme0n1", "sdb"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
|
||||
@@ -2,12 +2,31 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
type LiveBootSource struct {
|
||||
InRAM bool `json:"in_ram"`
|
||||
Kind string `json:"kind"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkInterfaceSnapshot struct {
|
||||
Name string
|
||||
Up bool
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkSnapshot struct {
|
||||
Interfaces []NetworkInterfaceSnapshot
|
||||
DefaultRoutes []string
|
||||
ResolvConf string
|
||||
}
|
||||
|
||||
type ServiceAction string
|
||||
|
||||
const (
|
||||
@@ -39,6 +58,20 @@ type ToolStatus struct {
|
||||
OK bool
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaStressLoaderBuiltin = "builtin"
|
||||
NvidiaStressLoaderJohn = "john"
|
||||
NvidiaStressLoaderNCCL = "nccl"
|
||||
)
|
||||
|
||||
type NvidiaStressOptions struct {
|
||||
DurationSec int
|
||||
SizeMB int
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
return &System{}
|
||||
}
|
||||
|
||||
@@ -1,156 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "esc":
|
||||
m.screen = screenNetwork
|
||||
m.formFields = nil
|
||||
m.formIndex = 0
|
||||
return m, nil
|
||||
case "up", "shift+tab":
|
||||
if m.formIndex > 0 {
|
||||
m.formIndex--
|
||||
}
|
||||
case "down", "tab":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
}
|
||||
case "enter":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
return m, nil
|
||||
}
|
||||
cfg := m.app.ParseStaticIPv4Config(m.selectedIface, []string{
|
||||
m.formFields[0].Value,
|
||||
m.formFields[1].Value,
|
||||
m.formFields[2].Value,
|
||||
m.formFields[3].Value,
|
||||
})
|
||||
m.busy = true
|
||||
m.busyTitle = "Static IPv4: " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.SetStaticIPv4Result(cfg)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case "backspace":
|
||||
field := &m.formFields[m.formIndex]
|
||||
if len(field.Value) > 0 {
|
||||
field.Value = field.Value[:len(field.Value)-1]
|
||||
}
|
||||
default:
|
||||
if msg.Type == tea.KeyRunes && len(msg.Runes) > 0 {
|
||||
m.formFields[m.formIndex].Value += string(msg.Runes)
|
||||
}
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "left", "up", "tab":
|
||||
if m.cursor > 0 {
|
||||
m.cursor--
|
||||
}
|
||||
case "right", "down":
|
||||
if m.cursor < 1 {
|
||||
m.cursor++
|
||||
}
|
||||
case "esc":
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
case "enter":
|
||||
if m.cursor == 1 { // Cancel
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
}
|
||||
m.busy = true
|
||||
switch m.pendingAction {
|
||||
case actionExportBundle:
|
||||
m.busyTitle = "Export support bundle"
|
||||
target := *m.selectedTarget
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.ExportSupportBundleResult(target)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
||||
}
|
||||
case actionRunAll:
|
||||
return m.executeRunAll()
|
||||
case actionRunMemorySAT:
|
||||
m.busyTitle = "Memory test"
|
||||
m.progressPrefix = "memory"
|
||||
m.progressSince = time.Now()
|
||||
m.progressLines = nil
|
||||
since := m.progressSince
|
||||
return m, tea.Batch(
|
||||
func() tea.Msg {
|
||||
result, err := m.app.RunMemoryAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||
},
|
||||
pollSATProgress("memory", since),
|
||||
)
|
||||
case actionRunStorageSAT:
|
||||
m.busyTitle = "Storage test"
|
||||
m.progressPrefix = "storage"
|
||||
m.progressSince = time.Now()
|
||||
m.progressLines = nil
|
||||
since := m.progressSince
|
||||
return m, tea.Batch(
|
||||
func() tea.Msg {
|
||||
result, err := m.app.RunStorageAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||
},
|
||||
pollSATProgress("storage", since),
|
||||
)
|
||||
case actionRunCPUSAT:
|
||||
m.busyTitle = "CPU test"
|
||||
m.progressPrefix = "cpu"
|
||||
m.progressSince = time.Now()
|
||||
m.progressLines = nil
|
||||
since := m.progressSince
|
||||
durationSec := hcCPUDurations[m.hcMode]
|
||||
return m, tea.Batch(
|
||||
func() tea.Msg {
|
||||
result, err := m.app.RunCPUAcceptancePackResult("", durationSec)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||
},
|
||||
pollSATProgress("cpu", since),
|
||||
)
|
||||
case actionRunAMDGPUSAT:
|
||||
m.busyTitle = "AMD GPU test"
|
||||
m.progressPrefix = "gpu-amd"
|
||||
m.progressSince = time.Now()
|
||||
m.progressLines = nil
|
||||
since := m.progressSince
|
||||
return m, tea.Batch(
|
||||
func() tea.Msg {
|
||||
result, err := m.app.RunAMDAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenHealthCheck}
|
||||
},
|
||||
pollSATProgress("gpu-amd", since),
|
||||
)
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportBundle:
|
||||
return screenExportTargets
|
||||
case actionRunAll, actionRunMemorySAT, actionRunStorageSAT, actionRunCPUSAT, actionRunAMDGPUSAT:
|
||||
return screenHealthCheck
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
type resultMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
back screen
|
||||
}
|
||||
|
||||
type servicesMsg struct {
|
||||
services []string
|
||||
err error
|
||||
}
|
||||
|
||||
type interfacesMsg struct {
|
||||
ifaces []platform.InterfaceInfo
|
||||
err error
|
||||
}
|
||||
|
||||
type exportTargetsMsg struct {
|
||||
targets []platform.RemovableTarget
|
||||
err error
|
||||
}
|
||||
|
||||
type panelMsg struct {
|
||||
data app.HardwarePanelData
|
||||
}
|
||||
|
||||
type nvidiaGPUsMsg struct {
|
||||
gpus []platform.NvidiaGPU
|
||||
err error
|
||||
}
|
||||
|
||||
type nvtopClosedMsg struct{}
|
||||
|
||||
type nvidiaSATDoneMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.targets) == 0 {
|
||||
return m, resultCmd("Export support bundle", "No removable filesystems found", nil, screenMain)
|
||||
}
|
||||
target := m.targets[m.cursor]
|
||||
m.selectedTarget = &target
|
||||
m.pendingAction = actionExportBundle
|
||||
m.screen = screenConfirm
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,307 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// Component indices.
|
||||
const (
|
||||
hcGPU = 0
|
||||
hcMemory = 1
|
||||
hcStorage = 2
|
||||
hcCPU = 3
|
||||
)
|
||||
|
||||
// Cursor positions in Health Check screen.
|
||||
const (
|
||||
hcCurGPU = 0
|
||||
hcCurMemory = 1
|
||||
hcCurStorage = 2
|
||||
hcCurCPU = 3
|
||||
hcCurSelectAll = 4
|
||||
hcCurModeQuick = 5
|
||||
hcCurModeStd = 6
|
||||
hcCurModeExpr = 7
|
||||
hcCurRunAll = 8
|
||||
hcCurTotal = 9
|
||||
)
|
||||
|
||||
// hcModeDurations maps mode index (0=Quick,1=Standard,2=Express) to GPU stress seconds.
|
||||
var hcModeDurations = [3]int{600, 3600, 28800}
|
||||
|
||||
// hcCPUDurations maps mode index to CPU stress-ng seconds.
|
||||
var hcCPUDurations = [3]int{60, 300, 900}
|
||||
|
||||
func (m model) enterHealthCheck() (tea.Model, tea.Cmd) {
|
||||
m.screen = screenHealthCheck
|
||||
if !m.hcInitialized {
|
||||
m.hcSel = [4]bool{true, true, true, true}
|
||||
m.hcMode = 0
|
||||
m.hcCursor = 0
|
||||
m.hcInitialized = true
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateHealthCheck(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.hcCursor > 0 {
|
||||
m.hcCursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.hcCursor < hcCurTotal-1 {
|
||||
m.hcCursor++
|
||||
}
|
||||
case " ":
|
||||
switch m.hcCursor {
|
||||
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
||||
m.hcSel[m.hcCursor] = !m.hcSel[m.hcCursor]
|
||||
case hcCurSelectAll:
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
m.hcSel[i] = !allOn
|
||||
}
|
||||
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||
}
|
||||
case "enter":
|
||||
switch m.hcCursor {
|
||||
case hcCurGPU, hcCurMemory, hcCurStorage, hcCurCPU:
|
||||
return m.hcRunSingle(m.hcCursor)
|
||||
case hcCurSelectAll:
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
m.hcSel[i] = !allOn
|
||||
}
|
||||
case hcCurModeQuick, hcCurModeStd, hcCurModeExpr:
|
||||
m.hcMode = m.hcCursor - hcCurModeQuick
|
||||
case hcCurRunAll:
|
||||
return m.hcRunAll()
|
||||
}
|
||||
case "g", "G":
|
||||
return m.hcRunSingle(hcGPU)
|
||||
case "m", "M":
|
||||
return m.hcRunSingle(hcMemory)
|
||||
case "s", "S":
|
||||
return m.hcRunSingle(hcStorage)
|
||||
case "c", "C":
|
||||
return m.hcRunSingle(hcCPU)
|
||||
case "r", "R":
|
||||
return m.hcRunAll()
|
||||
case "a", "A":
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
for i := range m.hcSel {
|
||||
m.hcSel[i] = !allOn
|
||||
}
|
||||
case "1":
|
||||
m.hcMode = 0
|
||||
case "2":
|
||||
m.hcMode = 1
|
||||
case "3":
|
||||
m.hcMode = 2
|
||||
case "esc":
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
case "q", "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunSingle(idx int) (tea.Model, tea.Cmd) {
|
||||
switch idx {
|
||||
case hcGPU:
|
||||
if m.app.DetectGPUVendor() == "amd" {
|
||||
m.pendingAction = actionRunAMDGPUSAT
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
m.nvidiaDurIdx = m.hcMode
|
||||
return m.enterNvidiaSATSetup()
|
||||
case hcMemory:
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case hcStorage:
|
||||
m.pendingAction = actionRunStorageSAT
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case hcCPU:
|
||||
m.pendingAction = actionRunCPUSAT
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) hcRunAll() (tea.Model, tea.Cmd) {
|
||||
for _, sel := range m.hcSel {
|
||||
if sel {
|
||||
m.pendingAction = actionRunAll
|
||||
m.screen = screenConfirm
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) executeRunAll() (tea.Model, tea.Cmd) {
|
||||
durationSec := hcModeDurations[m.hcMode]
|
||||
durationIdx := m.hcMode
|
||||
sel := m.hcSel
|
||||
app := m.app
|
||||
m.busy = true
|
||||
m.busyTitle = "Health Check"
|
||||
return m, func() tea.Msg {
|
||||
var parts []string
|
||||
if sel[hcGPU] {
|
||||
vendor := app.DetectGPUVendor()
|
||||
if vendor == "amd" {
|
||||
r, err := app.RunAMDAcceptancePackResult("")
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== GPU (AMD) ===\n"+body)
|
||||
} else {
|
||||
gpus, err := app.ListNvidiaGPUs()
|
||||
if err != nil || len(gpus) == 0 {
|
||||
parts = append(parts, "=== GPU ===\nNo NVIDIA GPUs detected or driver not loaded.")
|
||||
} else {
|
||||
var indices []int
|
||||
sizeMB := 0
|
||||
for _, g := range gpus {
|
||||
indices = append(indices, g.Index)
|
||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
||||
sizeMB = g.MemoryMB
|
||||
}
|
||||
}
|
||||
if sizeMB == 0 {
|
||||
sizeMB = 64
|
||||
}
|
||||
r, err := app.RunNvidiaAcceptancePackWithOptions(context.Background(), "", durationSec, sizeMB, indices)
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== GPU ===\n"+body)
|
||||
}
|
||||
}
|
||||
}
|
||||
if sel[hcMemory] {
|
||||
r, err := app.RunMemoryAcceptancePackResult("")
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== MEMORY ===\n"+body)
|
||||
}
|
||||
if sel[hcStorage] {
|
||||
r, err := app.RunStorageAcceptancePackResult("")
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== STORAGE ===\n"+body)
|
||||
}
|
||||
if sel[hcCPU] {
|
||||
cpuDur := hcCPUDurations[durationIdx]
|
||||
r, err := app.RunCPUAcceptancePackResult("", cpuDur)
|
||||
body := r.Body
|
||||
if err != nil {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
parts = append(parts, "=== CPU ===\n"+body)
|
||||
}
|
||||
combined := strings.Join(parts, "\n\n")
|
||||
if combined == "" {
|
||||
combined = "No components selected."
|
||||
}
|
||||
return resultMsg{title: "Health Check", body: combined, back: screenHealthCheck}
|
||||
}
|
||||
}
|
||||
|
||||
func renderHealthCheck(m model) string {
|
||||
var b strings.Builder
|
||||
|
||||
fmt.Fprintln(&b, "HEALTH CHECK")
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, " Diagnostics:")
|
||||
fmt.Fprintln(&b)
|
||||
|
||||
type comp struct{ name, desc, key string }
|
||||
comps := []comp{
|
||||
{"GPU", "nvidia/amd auto-detect", "G"},
|
||||
{"MEMORY", "memtester", "M"},
|
||||
{"STORAGE", "smartctl + NVMe self-test", "S"},
|
||||
{"CPU", "audit diagnostics", "C"},
|
||||
}
|
||||
for i, c := range comps {
|
||||
pfx := " "
|
||||
if m.hcCursor == i {
|
||||
pfx = "> "
|
||||
}
|
||||
ch := "[ ]"
|
||||
if m.hcSel[i] {
|
||||
ch = "[x]"
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %-8s %-28s [%s]\n", pfx, ch, c.name, c.desc, c.key)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b, " ─────────────────────────────────────────────────")
|
||||
{
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurSelectAll {
|
||||
pfx = "> "
|
||||
}
|
||||
allOn := m.hcSel[0] && m.hcSel[1] && m.hcSel[2] && m.hcSel[3]
|
||||
ch := "[ ]"
|
||||
if allOn {
|
||||
ch = "[x]"
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s Select / Deselect All [A]\n", pfx, ch)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, " Mode:")
|
||||
modes := []struct{ label, key string }{
|
||||
{"Quick", "1"},
|
||||
{"Standard", "2"},
|
||||
{"Express", "3"},
|
||||
}
|
||||
for i, mode := range modes {
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurModeQuick+i {
|
||||
pfx = "> "
|
||||
}
|
||||
radio := "( )"
|
||||
if m.hcMode == i {
|
||||
radio = "(*)"
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
{
|
||||
pfx := " "
|
||||
if m.hcCursor == hcCurRunAll {
|
||||
pfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s[ RUN ALL [R] ]\n", pfx)
|
||||
}
|
||||
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
||||
fmt.Fprint(&b, "[↑↓] move [space/enter] toggle [letter] single test [R] run all [Esc] back")
|
||||
return b.String()
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0: // Health Check
|
||||
return m.enterHealthCheck()
|
||||
case 1: // Export support bundle
|
||||
m.pendingAction = actionExportBundle
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
return m, func() tea.Msg {
|
||||
targets, err := m.app.ListRemovableTargets()
|
||||
return exportTargetsMsg{targets: targets, err: err}
|
||||
}
|
||||
case 2: // Settings
|
||||
m.screen = screenSettings
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 3: // Exit
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,76 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.busy = true
|
||||
m.busyTitle = "Network status"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.NetworkStatus()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 1:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP all interfaces"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPAllResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 2:
|
||||
m.pendingAction = actionDHCPOne
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 3:
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 4:
|
||||
m.screen = screenSettings
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.interfaces) == 0 {
|
||||
return m, resultCmd("interfaces", "No physical interfaces found", nil, screenNetwork)
|
||||
}
|
||||
m.selectedIface = m.interfaces[m.cursor].Name
|
||||
switch m.pendingAction {
|
||||
case actionDHCPOne:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP on " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPOneResult(m.selectedIface)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case actionStaticIPv4:
|
||||
defaults := m.app.DefaultStaticIPv4FormFields(m.selectedIface)
|
||||
m.formFields = []formField{
|
||||
{Label: "IPv4 address", Value: defaults[0]},
|
||||
{Label: "Prefix", Value: defaults[1]},
|
||||
{Label: "Gateway", Value: strings.TrimSpace(defaults[2])},
|
||||
{Label: "DNS (space-separated)", Value: defaults[3]},
|
||||
}
|
||||
m.formIndex = 0
|
||||
m.screen = screenStaticForm
|
||||
return m, nil
|
||||
default:
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
@@ -1,238 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
var nvidiaDurationOptions = []struct {
|
||||
label string
|
||||
seconds int
|
||||
}{
|
||||
{"10 minutes", 600},
|
||||
{"1 hour", 3600},
|
||||
{"8 hours", 28800},
|
||||
{"24 hours", 86400},
|
||||
}
|
||||
|
||||
// enterNvidiaSATSetup resets the setup screen and starts loading GPU list.
|
||||
func (m model) enterNvidiaSATSetup() (tea.Model, tea.Cmd) {
|
||||
m.screen = screenNvidiaSATSetup
|
||||
m.nvidiaGPUs = nil
|
||||
m.nvidiaGPUSel = nil
|
||||
m.nvidiaDurIdx = 0
|
||||
m.nvidiaSATCursor = 0
|
||||
m.busy = true
|
||||
m.busyTitle = "NVIDIA SAT"
|
||||
return m, func() tea.Msg {
|
||||
gpus, err := m.app.ListNvidiaGPUs()
|
||||
return nvidiaGPUsMsg{gpus: gpus, err: err}
|
||||
}
|
||||
}
|
||||
|
||||
// handleNvidiaGPUsMsg processes the GPU list response.
|
||||
func (m model) handleNvidiaGPUsMsg(msg nvidiaGPUsMsg) (tea.Model, tea.Cmd) {
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "NVIDIA SAT"
|
||||
m.body = fmt.Sprintf("Failed to list GPUs: %v", msg.err)
|
||||
m.prevScreen = screenHealthCheck
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.nvidiaGPUs = msg.gpus
|
||||
m.nvidiaGPUSel = make([]bool, len(msg.gpus))
|
||||
for i := range m.nvidiaGPUSel {
|
||||
m.nvidiaGPUSel[i] = true // all selected by default
|
||||
}
|
||||
m.nvidiaSATCursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// updateNvidiaSATSetup handles keys on the setup screen.
|
||||
func (m model) updateNvidiaSATSetup(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
numDur := len(nvidiaDurationOptions)
|
||||
numGPU := len(m.nvidiaGPUs)
|
||||
totalItems := numDur + numGPU + 2 // +2: Start, Cancel
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.nvidiaSATCursor > 0 {
|
||||
m.nvidiaSATCursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.nvidiaSATCursor < totalItems-1 {
|
||||
m.nvidiaSATCursor++
|
||||
}
|
||||
case " ":
|
||||
switch {
|
||||
case m.nvidiaSATCursor < numDur:
|
||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
||||
case m.nvidiaSATCursor < numDur+numGPU:
|
||||
i := m.nvidiaSATCursor - numDur
|
||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
||||
}
|
||||
case "enter":
|
||||
startIdx := numDur + numGPU
|
||||
cancelIdx := startIdx + 1
|
||||
switch {
|
||||
case m.nvidiaSATCursor < numDur:
|
||||
m.nvidiaDurIdx = m.nvidiaSATCursor
|
||||
case m.nvidiaSATCursor < startIdx:
|
||||
i := m.nvidiaSATCursor - numDur
|
||||
m.nvidiaGPUSel[i] = !m.nvidiaGPUSel[i]
|
||||
case m.nvidiaSATCursor == startIdx:
|
||||
return m.startNvidiaSAT()
|
||||
case m.nvidiaSATCursor == cancelIdx:
|
||||
m.screen = screenHealthCheck
|
||||
m.cursor = 0
|
||||
}
|
||||
case "esc":
|
||||
m.screen = screenHealthCheck
|
||||
m.cursor = 0
|
||||
case "ctrl+c", "q":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// startNvidiaSAT launches the SAT and nvtop.
|
||||
func (m model) startNvidiaSAT() (tea.Model, tea.Cmd) {
|
||||
var selectedGPUs []platform.NvidiaGPU
|
||||
for i, sel := range m.nvidiaGPUSel {
|
||||
if sel {
|
||||
selectedGPUs = append(selectedGPUs, m.nvidiaGPUs[i])
|
||||
}
|
||||
}
|
||||
if len(selectedGPUs) == 0 {
|
||||
selectedGPUs = m.nvidiaGPUs // fallback: use all if none explicitly selected
|
||||
}
|
||||
|
||||
sizeMB := 0
|
||||
for _, g := range selectedGPUs {
|
||||
if sizeMB == 0 || g.MemoryMB < sizeMB {
|
||||
sizeMB = g.MemoryMB
|
||||
}
|
||||
}
|
||||
if sizeMB == 0 {
|
||||
sizeMB = 64
|
||||
}
|
||||
|
||||
var gpuIndices []int
|
||||
for _, g := range selectedGPUs {
|
||||
gpuIndices = append(gpuIndices, g.Index)
|
||||
}
|
||||
|
||||
durationSec := nvidiaDurationOptions[m.nvidiaDurIdx].seconds
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
m.nvidiaSATCancel = cancel
|
||||
m.nvidiaSATAborted = false
|
||||
m.screen = screenNvidiaSATRunning
|
||||
m.nvidiaSATCursor = 0
|
||||
|
||||
satCmd := func() tea.Msg {
|
||||
result, err := m.app.RunNvidiaAcceptancePackWithOptions(ctx, "", durationSec, sizeMB, gpuIndices)
|
||||
return nvidiaSATDoneMsg{title: result.Title, body: result.Body, err: err}
|
||||
}
|
||||
|
||||
nvtopPath, lookErr := exec.LookPath("nvtop")
|
||||
if lookErr != nil {
|
||||
// nvtop not available: just run the SAT, show running screen
|
||||
return m, satCmd
|
||||
}
|
||||
|
||||
return m, tea.Batch(
|
||||
satCmd,
|
||||
tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
// updateNvidiaSATRunning handles keys on the running screen.
|
||||
func (m model) updateNvidiaSATRunning(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "o", "O":
|
||||
nvtopPath, err := exec.LookPath("nvtop")
|
||||
if err != nil {
|
||||
return m, nil
|
||||
}
|
||||
return m, tea.ExecProcess(exec.Command(nvtopPath), func(_ error) tea.Msg {
|
||||
return nvtopClosedMsg{}
|
||||
})
|
||||
case "a", "A":
|
||||
if m.nvidiaSATCancel != nil {
|
||||
m.nvidiaSATCancel()
|
||||
m.nvidiaSATCancel = nil
|
||||
}
|
||||
m.nvidiaSATAborted = true
|
||||
m.screen = screenHealthCheck
|
||||
m.cursor = 0
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// renderNvidiaSATSetup renders the setup screen.
|
||||
func renderNvidiaSATSetup(m model) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintln(&b, "NVIDIA SAT")
|
||||
fmt.Fprintln(&b)
|
||||
fmt.Fprintln(&b, "Duration:")
|
||||
for i, opt := range nvidiaDurationOptions {
|
||||
radio := "( )"
|
||||
if i == m.nvidiaDurIdx {
|
||||
radio = "(*)"
|
||||
}
|
||||
prefix := " "
|
||||
if m.nvidiaSATCursor == i {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %s\n", prefix, radio, opt.label)
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
if len(m.nvidiaGPUs) == 0 {
|
||||
fmt.Fprintln(&b, "GPUs: (none detected)")
|
||||
} else {
|
||||
fmt.Fprintln(&b, "GPUs:")
|
||||
for i, gpu := range m.nvidiaGPUs {
|
||||
check := "[ ]"
|
||||
if m.nvidiaGPUSel[i] {
|
||||
check = "[x]"
|
||||
}
|
||||
prefix := " "
|
||||
if m.nvidiaSATCursor == len(nvidiaDurationOptions)+i {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%s%s %d: %s (%d MB)\n", prefix, check, gpu.Index, gpu.Name, gpu.MemoryMB)
|
||||
}
|
||||
}
|
||||
fmt.Fprintln(&b)
|
||||
startIdx := len(nvidiaDurationOptions) + len(m.nvidiaGPUs)
|
||||
startPfx := " "
|
||||
cancelPfx := " "
|
||||
if m.nvidiaSATCursor == startIdx {
|
||||
startPfx = "> "
|
||||
}
|
||||
if m.nvidiaSATCursor == startIdx+1 {
|
||||
cancelPfx = "> "
|
||||
}
|
||||
fmt.Fprintf(&b, "%sStart\n", startPfx)
|
||||
fmt.Fprintf(&b, "%sCancel\n", cancelPfx)
|
||||
fmt.Fprintln(&b)
|
||||
b.WriteString("[↑/↓] move [space] toggle [enter] select [esc] cancel\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderNvidiaSATRunning renders the running screen.
|
||||
func renderNvidiaSATRunning() string {
|
||||
return "NVIDIA SAT\n\nTest is running...\n\n[o] Open nvtop [a] Abort test [ctrl+c] quit\n"
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.services) == 0 {
|
||||
return m, resultCmd("Services", "No bee-* services found.", nil, screenSettings)
|
||||
}
|
||||
m.selectedService = m.services[m.cursor]
|
||||
m.screen = screenServiceAction
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
|
||||
action := m.serviceMenu[m.cursor]
|
||||
if action == "back" {
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
m.busy = true
|
||||
m.busyTitle = "service: " + m.selectedService
|
||||
return m, func() tea.Msg {
|
||||
switch action {
|
||||
case "Status":
|
||||
result, err := m.app.ServiceStatusResult(m.selectedService)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Restart":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Start":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Stop":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
default:
|
||||
return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,64 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleSettingsMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0: // Network
|
||||
m.screen = screenNetwork
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 1: // Services
|
||||
m.busy = true
|
||||
m.busyTitle = "Services"
|
||||
return m, func() tea.Msg {
|
||||
services, err := m.app.ListBeeServices()
|
||||
return servicesMsg{services: services, err: err}
|
||||
}
|
||||
case 2: // Re-run audit
|
||||
m.busy = true
|
||||
m.busyTitle = "Re-run audit"
|
||||
runtimeMode := m.runtimeMode
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunAuditNow(runtimeMode)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
|
||||
}
|
||||
case 3: // Run self-check
|
||||
m.busy = true
|
||||
m.busyTitle = "Self-check"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunRuntimePreflightResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenSettings}
|
||||
}
|
||||
case 4: // Runtime issues
|
||||
m.busy = true
|
||||
m.busyTitle = "Runtime issues"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.RuntimeHealthResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
||||
}
|
||||
case 5: // Audit logs
|
||||
m.busy = true
|
||||
m.busyTitle = "Audit logs"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.AuditLogTailResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
||||
}
|
||||
case 6: // Check tools
|
||||
m.busy = true
|
||||
m.busyTitle = "Check tools"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.ToolCheckResult([]string{
|
||||
"dmidecode", "smartctl", "nvme", "ipmitool", "lspci",
|
||||
"ethtool", "bee", "nvidia-smi", "bee-gpu-stress",
|
||||
"memtester", "dhclient", "lsblk", "mount",
|
||||
})
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenSettings}
|
||||
}
|
||||
case 7: // Back
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,579 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func newTestModel() model {
|
||||
return newModel(app.New(platform.New()), runtimeenv.ModeLocal)
|
||||
}
|
||||
|
||||
func sendKey(t *testing.T, m model, key tea.KeyType) model {
|
||||
t.Helper()
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: key})
|
||||
return next.(model)
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuCursorNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 2 {
|
||||
t.Fatalf("cursor=%d want 2 after second down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyUp)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after up", m.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuEnterActions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
wantBusy bool
|
||||
wantCmd bool
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
||||
{name: "export", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
||||
{name: "exit", cursor: 3, wantScreen: screenMain, wantCmd: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.busy != test.wantBusy {
|
||||
t.Fatalf("busy=%v want %v", got.busy, test.wantBusy)
|
||||
}
|
||||
if (cmd != nil) != test.wantCmd {
|
||||
t.Fatalf("cmd present=%v want %v", cmd != nil, test.wantCmd)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateConfirmCancelViaKeys(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
|
||||
got := next.(model)
|
||||
if got.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after right", got.cursor)
|
||||
}
|
||||
|
||||
next, _ = got.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got = next.(model)
|
||||
if got.screen != screenHealthCheck {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenHealthCheck)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0 after cancel", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuSimpleTransitions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "health_check", cursor: 0, wantScreen: screenHealthCheck},
|
||||
{name: "settings", cursor: 2, wantScreen: screenSettings},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatalf("expected nil cmd for %s", test.name)
|
||||
}
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuExportSetsBusy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1 // Export support bundle
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if !got.busy {
|
||||
t.Fatal("busy=false for export")
|
||||
}
|
||||
if cmd == nil {
|
||||
t.Fatal("expected async cmd for export")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainViewRendersTwoColumns(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1
|
||||
|
||||
view := m.View()
|
||||
for _, want := range []string{
|
||||
"bee",
|
||||
"Health Check",
|
||||
"> Export support bundle",
|
||||
"Settings",
|
||||
"Exit",
|
||||
"│",
|
||||
"[↑↓] move",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapeNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
screen screen
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "network to settings", screen: screenNetwork, wantScreen: screenSettings},
|
||||
{name: "services to settings", screen: screenServices, wantScreen: screenSettings},
|
||||
{name: "settings to main", screen: screenSettings, wantScreen: screenMain},
|
||||
{name: "service action to services", screen: screenServiceAction, wantScreen: screenServices},
|
||||
{name: "export targets to main", screen: screenExportTargets, wantScreen: screenMain},
|
||||
{name: "interface pick to network", screen: screenInterfacePick, wantScreen: screenNetwork},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = test.screen
|
||||
m.cursor = 3
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthCheckEscReturnsToMain(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenHealthCheck
|
||||
m.hcCursor = 3
|
||||
|
||||
next, _ := m.updateHealthCheck(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenMain {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenMain)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.prevScreen = screenNetwork
|
||||
m.title = "title"
|
||||
m.body = "body"
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenNetwork {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNetwork)
|
||||
}
|
||||
if got.title != "" || got.body != "" {
|
||||
t.Fatalf("expected output state cleared, got title=%q body=%q", got.title, got.body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthCheckGPUOpensNvidiaSATSetup(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenHealthCheck
|
||||
m.hcInitialized = true
|
||||
m.hcSel = [4]bool{true, true, true, true}
|
||||
|
||||
next, cmd := m.hcRunSingle(hcGPU)
|
||||
got := next.(model)
|
||||
|
||||
if cmd == nil {
|
||||
t.Fatal("expected non-nil cmd (GPU list loader)")
|
||||
}
|
||||
if got.screen != screenNvidiaSATSetup {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNvidiaSATSetup)
|
||||
}
|
||||
|
||||
// esc from setup returns to health check
|
||||
next, _ = got.updateNvidiaSATSetup(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got = next.(model)
|
||||
if got.screen != screenHealthCheck {
|
||||
t.Fatalf("screen after esc=%q want %q", got.screen, screenHealthCheck)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealthCheckRunSingleMapsActions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
idx int
|
||||
want actionKind
|
||||
}{
|
||||
{idx: hcMemory, want: actionRunMemorySAT},
|
||||
{idx: hcStorage, want: actionRunStorageSAT},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
m := newTestModel()
|
||||
m.screen = screenHealthCheck
|
||||
m.hcInitialized = true
|
||||
|
||||
next, _ := m.hcRunSingle(test.idx)
|
||||
got := next.(model)
|
||||
if got.pendingAction != test.want {
|
||||
t.Fatalf("idx=%d pendingAction=%q want %q", test.idx, got.pendingAction, test.want)
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("idx=%d screen=%q want %q", test.idx, got.screen, screenConfirm)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportTargetSelectionOpensConfirm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{{Device: "/dev/sdb1", FSType: "vfat", Size: "16G"}}
|
||||
|
||||
next, cmd := m.handleExportTargetsMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionExportBundle {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionExportBundle)
|
||||
}
|
||||
if got.selectedTarget == nil || got.selectedTarget.Device != "/dev/sdb1" {
|
||||
t.Fatalf("selectedTarget=%+v want /dev/sdb1", got.selectedTarget)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInterfacePickStaticIPv4OpensForm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.interfaces = []platform.InterfaceInfo{{Name: "eth0"}}
|
||||
|
||||
next, cmd := m.handleInterfacePickMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenStaticForm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenStaticForm)
|
||||
}
|
||||
if got.selectedIface != "eth0" {
|
||||
t.Fatalf("selectedIface=%q want eth0", got.selectedIface)
|
||||
}
|
||||
if len(got.formFields) != 4 {
|
||||
t.Fatalf("len(formFields)=%d want 4", len(got.formFields))
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgUsesExplicitBackScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "done", body: "ok", back: screenNetwork})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenOutput {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
||||
}
|
||||
if got.prevScreen != screenNetwork {
|
||||
t.Fatalf("prevScreen=%q want %q", got.prevScreen, screenNetwork)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfirmCancelTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m.pendingAction = actionExportBundle
|
||||
if got := m.confirmCancelTarget(); got != screenExportTargets {
|
||||
t.Fatalf("export cancel target=%q want %q", got, screenExportTargets)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunAll
|
||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
||||
t.Fatalf("run all cancel target=%q want %q", got, screenHealthCheck)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
||||
t.Fatalf("memory sat cancel target=%q want %q", got, screenHealthCheck)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunStorageSAT
|
||||
if got := m.confirmCancelTarget(); got != screenHealthCheck {
|
||||
t.Fatalf("storage sat cancel target=%q want %q", got, screenHealthCheck)
|
||||
}
|
||||
|
||||
m.pendingAction = actionNone
|
||||
if got := m.confirmCancelTarget(); got != screenMain {
|
||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
||||
}
|
||||
}
|
||||
|
||||
// TestViewBusyStateIsMinimal pins the exact busy-screen output when no busy
// title and no progress lines are set: just the app name, "Working...", and
// the quit hint.
func TestViewBusyStateIsMinimal(t *testing.T) {
	t.Parallel()

	m := newTestModel()
	m.busy = true

	view := m.View()
	want := "bee\n\nWorking...\n\n[ctrl+c] quit\n"
	if view != want {
		t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", want, view)
	}
}
|
||||
|
||||
func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export support bundle",
|
||||
"Working...",
|
||||
"[ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.title = "Run audit"
|
||||
m.body = "audit output: /appdata/bee/export/bee-audit.json\n"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Run audit",
|
||||
"audit output: /appdata/bee/export/bee-audit.json",
|
||||
"[enter/esc] back [ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{
|
||||
{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "vfat",
|
||||
Size: "29G",
|
||||
Label: "BEEUSB",
|
||||
Mountpoint: "/media/bee",
|
||||
},
|
||||
}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export support bundle",
|
||||
"Select removable filesystem",
|
||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenStaticForm
|
||||
m.selectedIface = "enp1s0"
|
||||
m.formFields = []formField{
|
||||
{Label: "Address", Value: "192.0.2.10/24"},
|
||||
{Label: "Gateway", Value: "192.0.2.1"},
|
||||
{Label: "DNS", Value: "1.1.1.1"},
|
||||
}
|
||||
m.formIndex = 1
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Static IPv4: enp1s0",
|
||||
" Address: 192.0.2.10/24",
|
||||
"> Gateway: 192.0.2.1",
|
||||
" DNS: 1.1.1.1",
|
||||
"[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionExportBundle
|
||||
m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export support bundle",
|
||||
"Copy support bundle to /dev/sdb1?",
|
||||
"> Confirm",
|
||||
" Cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export support bundle"
|
||||
m.pendingAction = actionExportBundle
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "Export support bundle", body: "done", back: screenMain})
|
||||
got := next.(model)
|
||||
|
||||
if got.busy {
|
||||
t.Fatal("busy=true want false")
|
||||
}
|
||||
if got.busyTitle != "" {
|
||||
t.Fatalf("busyTitle=%q want empty", got.busyTitle)
|
||||
}
|
||||
if got.pendingAction != actionNone {
|
||||
t.Fatalf("pendingAction=%q want empty", got.pendingAction)
|
||||
}
|
||||
}
|
||||
|
||||
// TestResultMsgErrorWithoutBodyFormatsCleanly verifies that a resultMsg
// carrying an error but an empty body renders as "ERROR: <err>" with no
// leading blank lines.
func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
	t.Parallel()

	m := newTestModel()

	next, _ := m.Update(resultMsg{title: "Export support bundle", err: assertErr("boom"), back: screenMain})
	got := next.(model)

	if got.body != "ERROR: boom" {
		t.Fatalf("body=%q want %q", got.body, "ERROR: boom")
	}
}
|
||||
|
||||
// assertErr is a minimal error implementation used to inject a known error
// value into messages under test.
type assertErr string

// Error returns the underlying string, satisfying the error interface.
func (e assertErr) Error() string { return string(e) }
|
||||
@@ -1,192 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// screen identifies which view of the TUI is currently active.
type screen string

// All screens the TUI can display. The string values are internal state
// identifiers, not user-visible text.
const (
	screenMain             screen = "main"
	screenHealthCheck      screen = "health_check"
	screenSettings         screen = "settings"
	screenNetwork          screen = "network"
	screenInterfacePick    screen = "interface_pick"
	screenServices         screen = "services"
	screenServiceAction    screen = "service_action"
	screenExportTargets    screen = "export_targets"
	screenOutput           screen = "output"
	screenStaticForm       screen = "static_form"
	screenConfirm          screen = "confirm"
	screenNvidiaSATSetup   screen = "nvidia_sat_setup"
	screenNvidiaSATRunning screen = "nvidia_sat_running"
)
|
||||
|
||||
// actionKind names a user-initiated operation that is pending confirmation
// or execution.
type actionKind string

// Pending actions. actionNone (the zero value) means nothing is pending.
const (
	actionNone          actionKind = ""
	actionDHCPOne       actionKind = "dhcp_one"
	actionStaticIPv4    actionKind = "static_ipv4"
	actionExportBundle  actionKind = "export_bundle"
	actionRunAll        actionKind = "run_all"
	actionRunMemorySAT  actionKind = "run_memory_sat"
	actionRunStorageSAT actionKind = "run_storage_sat"
	actionRunCPUSAT     actionKind = "run_cpu_sat"
	actionRunAMDGPUSAT  actionKind = "run_amd_gpu_sat"
)
|
||||
|
||||
// model is the single Bubble Tea model for the whole TUI. It tracks the
// active screen, menu state, data loaded by async commands, and the state
// of any in-flight operation.
type model struct {
	app         *app.App
	runtimeMode runtimeenv.Mode

	screen     screen // currently displayed screen
	prevScreen screen // screen the output screen returns to
	cursor     int    // cursor shared by the single-column menu screens
	busy       bool   // true while an async command is running
	busyTitle  string // title shown on the busy screen
	title      string // output-screen title
	body       string // output-screen body

	mainMenu     []string
	settingsMenu []string
	networkMenu  []string
	serviceMenu  []string

	services        []string
	interfaces      []platform.InterfaceInfo
	targets         []platform.RemovableTarget
	selectedService string
	selectedIface   string
	selectedTarget  *platform.RemovableTarget
	pendingAction   actionKind

	formFields []formField
	formIndex  int

	// Hardware panel (right column)
	panel       app.HardwarePanelData
	panelFocus  bool // true when the right panel, not the menu, has focus
	panelCursor int

	// Health Check screen
	hcSel         [4]bool // component toggles; order GPU/Memory/Storage/CPU (see confirmBody)
	hcMode        int     // 0=Quick 1=Standard 2=Express (see confirmBody)
	hcCursor      int
	hcInitialized bool

	// NVIDIA SAT setup
	nvidiaGPUs      []platform.NvidiaGPU
	nvidiaGPUSel    []bool
	nvidiaDurIdx    int
	nvidiaSATCursor int

	// NVIDIA SAT running
	nvidiaSATCancel  func() // cancels the running SAT; nil when not running
	nvidiaSATAborted bool   // set when the user aborted; completion is then ignored

	// SAT verbose progress (CPU / Memory / Storage / AMD GPU)
	progressLines  []string
	progressPrefix string // non-empty while progress polling is active
	progressSince  time.Time
}
|
||||
|
||||
// formField is one labelled text input in the static-IPv4 form.
type formField struct {
	Label string
	Value string
}
|
||||
|
||||
func Run(application *app.App, runtimeMode runtimeenv.Mode) error {
|
||||
options := []tea.ProgramOption{}
|
||||
if runtimeMode != runtimeenv.ModeLiveCD {
|
||||
options = append(options, tea.WithAltScreen())
|
||||
}
|
||||
program := tea.NewProgram(newModel(application, runtimeMode), options...)
|
||||
_, err := program.Run()
|
||||
return err
|
||||
}
|
||||
|
||||
// newModel builds the initial model: main screen active, all menus
// populated with their fixed entries, everything else zero-valued.
func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
	return model{
		app:         application,
		runtimeMode: runtimeMode,
		screen:      screenMain,
		// Menu entry order matters: handlers index these by cursor position.
		mainMenu: []string{
			"Health Check",
			"Export support bundle",
			"Settings",
			"Exit",
		},
		settingsMenu: []string{
			"Network",
			"Services",
			"Re-run audit",
			"Run self-check",
			"Runtime issues",
			"Audit logs",
			"Check tools",
			"Back",
		},
		networkMenu: []string{
			"Show status",
			"DHCP on all interfaces",
			"DHCP on one interface",
			"Set static IPv4",
			"Back",
		},
		serviceMenu: []string{
			"Status",
			"Restart",
			"Start",
			"Stop",
			"Back",
		},
	}
}
|
||||
|
||||
// Init returns a command that loads the hardware panel data and delivers
// it to Update as a panelMsg.
func (m model) Init() tea.Cmd {
	return func() tea.Msg {
		return panelMsg{data: m.app.LoadHardwarePanel()}
	}
}
|
||||
|
||||
// confirmBody returns the (title, prompt) pair shown on the confirm screen
// for the currently pending action.
func (m model) confirmBody() (string, string) {
	switch m.pendingAction {
	case actionExportBundle:
		if m.selectedTarget == nil {
			return "Export support bundle", "No target selected"
		}
		return "Export support bundle", "Copy support bundle to " + m.selectedTarget.Device + "?"
	case actionRunAll:
		// NOTE(review): assumes hcMode is 0..2 — indexing panics otherwise;
		// confirm the health-check screen clamps it.
		modes := []string{"Quick", "Standard", "Express"}
		mode := modes[m.hcMode]
		var sel []string
		// names must stay aligned with the hcSel toggle order.
		names := []string{"GPU", "Memory", "Storage", "CPU"}
		for i, on := range m.hcSel {
			if on {
				sel = append(sel, names[i])
			}
		}
		if len(sel) == 0 {
			return "Health Check", "No components selected."
		}
		return "Health Check", "Run: " + strings.Join(sel, " + ") + "\nMode: " + mode
	case actionRunMemorySAT:
		return "Memory test", "Run memtester?"
	case actionRunStorageSAT:
		return "Storage test", "Run storage diagnostic pack?"
	case actionRunCPUSAT:
		// Same hcMode-in-range assumption as actionRunAll above.
		modes := []string{"Quick (60s)", "Standard (300s)", "Express (900s)"}
		return "CPU test", "Run stress-ng? Mode: " + modes[m.hcMode]
	case actionRunAMDGPUSAT:
		return "AMD GPU test", "Run AMD GPU diagnostic pack (rocm-smi)?"
	default:
		return "Confirm", "Proceed?"
	}
}
|
||||
@@ -1,255 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// Update is the Bubble Tea message handler. Key events are routed to
// per-screen handlers; the remaining cases apply results of async commands
// (SAT progress polling, action results, loaded lists, panel data, NVIDIA
// SAT lifecycle messages).
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	switch msg := msg.(type) {
	case tea.KeyMsg:
		// While busy only ctrl+c is honoured; every other key is dropped.
		if m.busy {
			if msg.String() == "ctrl+c" {
				return m, tea.Quit
			}
			return m, nil
		}
		return m.updateKey(msg)
	case satProgressMsg:
		// Keep polling while a SAT run is in flight; retain the previous
		// lines when a poll returns nothing new.
		if m.busy && m.progressPrefix != "" {
			if len(msg.lines) > 0 {
				m.progressLines = msg.lines
			}
			return m, pollSATProgress(m.progressPrefix, m.progressSince)
		}
		return m, nil
	case resultMsg:
		// An async action finished: clear busy/progress state and show the
		// result (with the error appended, if any) on the output screen.
		m.busy = false
		m.busyTitle = ""
		m.progressLines = nil
		m.progressPrefix = ""
		m.title = msg.title
		if msg.err != nil {
			body := strings.TrimSpace(msg.body)
			if body == "" {
				m.body = fmt.Sprintf("ERROR: %v", msg.err)
			} else {
				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
			}
		} else {
			m.body = msg.body
		}
		m.pendingAction = actionNone
		// msg.back, when set, overrides where the output screen returns to.
		if msg.back != "" {
			m.prevScreen = msg.back
		} else {
			m.prevScreen = m.screen
		}
		m.screen = screenOutput
		m.cursor = 0
		return m, nil
	case servicesMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			// On failure show the error and fall back to Settings.
			m.title = "Services"
			m.body = msg.err.Error()
			m.prevScreen = screenSettings
			m.screen = screenOutput
			return m, nil
		}
		m.services = msg.services
		m.screen = screenServices
		m.cursor = 0
		return m, nil
	case interfacesMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			// On failure show the error and fall back to Network.
			m.title = "interfaces"
			m.body = msg.err.Error()
			m.prevScreen = screenNetwork
			m.screen = screenOutput
			return m, nil
		}
		m.interfaces = msg.ifaces
		m.screen = screenInterfacePick
		m.cursor = 0
		return m, nil
	case exportTargetsMsg:
		m.busy = false
		m.busyTitle = ""
		if msg.err != nil {
			// On failure show the error and fall back to the main screen.
			m.title = "export"
			m.body = msg.err.Error()
			m.prevScreen = screenMain
			m.screen = screenOutput
			return m, nil
		}
		m.targets = msg.targets
		m.screen = screenExportTargets
		m.cursor = 0
		return m, nil
	case panelMsg:
		m.panel = msg.data
		return m, nil
	case nvidiaGPUsMsg:
		return m.handleNvidiaGPUsMsg(msg)
	case nvtopClosedMsg:
		return m, nil
	case nvidiaSATDoneMsg:
		// Ignore completion that arrives after a user abort.
		if m.nvidiaSATAborted {
			return m, nil
		}
		// Release the cancel hook now that the run is over.
		if m.nvidiaSATCancel != nil {
			m.nvidiaSATCancel()
			m.nvidiaSATCancel = nil
		}
		m.prevScreen = screenHealthCheck
		m.screen = screenOutput
		m.title = msg.title
		if msg.err != nil {
			body := strings.TrimSpace(msg.body)
			if body == "" {
				m.body = fmt.Sprintf("ERROR: %v", msg.err)
			} else {
				m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
			}
		} else {
			m.body = msg.body
		}
		return m, nil
	}
	return m, nil
}
|
||||
|
||||
// updateKey dispatches a key press to the handler for the active screen.
// Screens backed by plain menus share updateMenu with their item count and
// enter handler; the rest have dedicated handlers.
func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
	switch m.screen {
	case screenMain:
		return m.updateMain(msg)
	case screenHealthCheck:
		return m.updateHealthCheck(msg)
	case screenSettings:
		return m.updateMenu(msg, len(m.settingsMenu), m.handleSettingsMenu)
	case screenNetwork:
		return m.updateMenu(msg, len(m.networkMenu), m.handleNetworkMenu)
	case screenServices:
		return m.updateMenu(msg, len(m.services), m.handleServicesMenu)
	case screenServiceAction:
		return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
	case screenNvidiaSATSetup:
		return m.updateNvidiaSATSetup(msg)
	case screenNvidiaSATRunning:
		return m.updateNvidiaSATRunning(msg)
	case screenExportTargets:
		return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
	case screenInterfacePick:
		return m.updateMenu(msg, len(m.interfaces), m.handleInterfacePickMenu)
	case screenOutput:
		switch msg.String() {
		case "esc", "enter", "q":
			// Leave the output screen and clear its one-shot state.
			m.screen = m.prevScreen
			m.body = ""
			m.title = ""
			m.pendingAction = actionNone
			// Refresh panel when returning to main screen.
			if m.prevScreen == screenMain {
				return m, func() tea.Msg { return panelMsg{data: m.app.LoadHardwarePanel()} }
			}
			return m, nil
		case "ctrl+c":
			return m, tea.Quit
		}
	case screenStaticForm:
		return m.updateStaticForm(msg)
	case screenConfirm:
		return m.updateConfirm(msg)
	}
	// Fallback for keys not consumed by any screen above.
	if msg.String() == "ctrl+c" {
		return m, tea.Quit
	}
	return m, nil
}
|
||||
|
||||
// updateMain handles keys on the main (two-column) screen.
|
||||
func (m model) updateMain(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
if m.panelFocus {
|
||||
return m.updateMainPanel(msg)
|
||||
}
|
||||
// Switch focus to right panel.
|
||||
if (msg.String() == "tab" || msg.String() == "right" || msg.String() == "l") && len(m.panel.Rows) > 0 {
|
||||
m.panelFocus = true
|
||||
return m, nil
|
||||
}
|
||||
return m.updateMenu(msg, len(m.mainMenu), m.handleMainMenu)
|
||||
}
|
||||
|
||||
// updateMainPanel handles keys when right panel has focus: up/down move the
// panel cursor, enter loads the selected component's detail asynchronously,
// tab/left/h/esc return focus to the menu, q/ctrl+c quit.
func (m model) updateMainPanel(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
	switch msg.String() {
	case "up", "k":
		if m.panelCursor > 0 {
			m.panelCursor--
		}
	case "down", "j":
		if m.panelCursor < len(m.panel.Rows)-1 {
			m.panelCursor++
		}
	case "enter":
		if m.panelCursor < len(m.panel.Rows) {
			// key is captured by value before the closure so the command is
			// stable even if the model changes afterwards.
			key := m.panel.Rows[m.panelCursor].Key
			m.busy = true
			m.busyTitle = key
			return m, func() tea.Msg {
				r := m.app.ComponentDetailResult(key)
				return resultMsg{title: r.Title, body: r.Body, back: screenMain}
			}
		}
	case "tab", "left", "h", "esc":
		m.panelFocus = false
	case "q", "ctrl+c":
		return m, tea.Quit
	}
	return m, nil
}
|
||||
|
||||
// updateMenu implements shared cursor movement, selection, and esc-based
// back navigation for the single-column menu screens. size is the number of
// menu items (clamped to at least 1 so cursor math is safe on empty menus);
// onEnter runs when the user presses enter.
func (m model) updateMenu(msg tea.KeyMsg, size int, onEnter func() (tea.Model, tea.Cmd)) (tea.Model, tea.Cmd) {
	if size == 0 {
		size = 1
	}
	switch msg.String() {
	case "up", "k":
		if m.cursor > 0 {
			m.cursor--
		}
	case "down", "j":
		if m.cursor < size-1 {
			m.cursor++
		}
	case "enter":
		return onEnter()
	case "esc":
		// Each menu screen has a fixed parent screen to return to.
		switch m.screen {
		case screenNetwork, screenServices:
			m.screen = screenSettings
			m.cursor = 0
		case screenSettings:
			m.screen = screenMain
			m.cursor = 0
		case screenServiceAction:
			m.screen = screenServices
			m.cursor = 0
		case screenExportTargets:
			m.screen = screenMain
			m.cursor = 0
		case screenInterfacePick:
			m.screen = screenNetwork
			m.cursor = 0
		}
	case "q", "ctrl+c":
		return m, tea.Quit
	}
	return m, nil
}
|
||||
@@ -1,233 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
"github.com/charmbracelet/lipgloss"
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
// Column widths for two-column main layout.
|
||||
const leftColWidth = 30
|
||||
|
||||
var (
|
||||
stylePass = lipgloss.NewStyle().Foreground(lipgloss.Color("10")) // bright green
|
||||
styleFail = lipgloss.NewStyle().Foreground(lipgloss.Color("9")) // bright red
|
||||
styleCancel = lipgloss.NewStyle().Foreground(lipgloss.Color("11")) // bright yellow
|
||||
styleNA = lipgloss.NewStyle().Foreground(lipgloss.Color("8")) // dark gray
|
||||
)
|
||||
|
||||
// colorStatus maps a status string to a colored, 4-character cell for the
// hardware panel. Unknown statuses render as gray "N/A ".
func colorStatus(status string) string {
	switch status {
	case "PASS":
		return stylePass.Render("PASS")
	case "FAIL":
		return styleFail.Render("FAIL")
	case "CANCEL":
		// Truncated to 4 chars so the status column stays aligned.
		return styleCancel.Render("CANC")
	default:
		return styleNA.Render("N/A ")
	}
}
|
||||
|
||||
// View renders the active screen. The busy state takes precedence over
// every screen: it shows either the verbose SAT progress lines or a plain
// "Working..." message under the busy title.
func (m model) View() string {
	if m.busy {
		title := "bee"
		if m.busyTitle != "" {
			title = m.busyTitle
		}
		if len(m.progressLines) > 0 {
			var b strings.Builder
			fmt.Fprintf(&b, "%s\n\n", title)
			for _, l := range m.progressLines {
				fmt.Fprintf(&b, "  %s\n", l)
			}
			b.WriteString("\n[ctrl+c] quit\n")
			return b.String()
		}
		return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
	}
	switch m.screen {
	case screenMain:
		return renderTwoColumnMain(m)
	case screenHealthCheck:
		return renderHealthCheck(m)
	case screenSettings:
		return renderMenu("Settings", "Select action", m.settingsMenu, m.cursor)
	case screenNetwork:
		return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
	case screenServices:
		return renderMenu("Services", "Select service", m.services, m.cursor)
	case screenServiceAction:
		return renderMenu("Service: "+m.selectedService, "Select action", m.serviceMenu, m.cursor)
	case screenExportTargets:
		return renderMenu("Export support bundle", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
	case screenInterfacePick:
		return renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
	case screenStaticForm:
		return renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
	case screenConfirm:
		title, body := m.confirmBody()
		return renderConfirm(title, body, m.cursor)
	case screenNvidiaSATSetup:
		return renderNvidiaSATSetup(m)
	case screenNvidiaSATRunning:
		return renderNvidiaSATRunning()
	case screenOutput:
		return fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
	default:
		return "bee\n"
	}
}
|
||||
|
||||
// renderTwoColumnMain renders the main screen with menu on the left and hardware panel on the right.
// The left column is padded to leftColWidth (measured with lipgloss.Width so
// ANSI escapes don't skew padding) and joined to the right column with a
// vertical rule; a key-hint footer reflects which column has focus.
func renderTwoColumnMain(m model) string {
	// Left column lines
	leftLines := []string{"bee", ""}
	for i, item := range m.mainMenu {
		pfx := "  "
		if !m.panelFocus && m.cursor == i {
			pfx = "> "
		}
		leftLines = append(leftLines, pfx+item)
	}

	// Right column lines
	rightLines := buildPanelLines(m)

	// Render side by side
	var b strings.Builder
	maxRows := max(len(leftLines), len(rightLines))
	for i := 0; i < maxRows; i++ {
		l := ""
		if i < len(leftLines) {
			l = leftLines[i]
		}
		r := ""
		if i < len(rightLines) {
			r = rightLines[i]
		}
		w := lipgloss.Width(l)
		if w < leftColWidth {
			l += strings.Repeat(" ", leftColWidth-w)
		}
		b.WriteString(l + " │ " + r + "\n")
	}

	sep := strings.Repeat("─", leftColWidth) + "─┴─" + strings.Repeat("─", 46)
	b.WriteString(sep + "\n")

	if m.panelFocus {
		b.WriteString("[↑↓] move [enter] details [tab/←] menu [ctrl+c] quit\n")
	} else {
		b.WriteString("[↑↓] move [enter] select [tab/→] panel [ctrl+c] quit\n")
	}

	return b.String()
}
|
||||
|
||||
func buildPanelLines(m model) []string {
|
||||
p := m.panel
|
||||
var lines []string
|
||||
|
||||
for _, h := range p.Header {
|
||||
lines = append(lines, h)
|
||||
}
|
||||
if len(p.Header) > 0 && len(p.Rows) > 0 {
|
||||
lines = append(lines, "")
|
||||
}
|
||||
|
||||
for i, row := range p.Rows {
|
||||
pfx := " "
|
||||
if m.panelFocus && m.panelCursor == i {
|
||||
pfx = "> "
|
||||
}
|
||||
status := colorStatus(row.Status)
|
||||
lines = append(lines, fmt.Sprintf("%s%s %-4s %s", pfx, status, row.Key, row.Detail))
|
||||
}
|
||||
|
||||
return lines
|
||||
}
|
||||
|
||||
func renderTargetItems(targets []platform.RemovableTarget) []string {
|
||||
items := make([]string, 0, len(targets))
|
||||
for _, target := range targets {
|
||||
desc := fmt.Sprintf("%s [%s %s]", target.Device, target.FSType, target.Size)
|
||||
if target.Label != "" {
|
||||
desc += " label=" + target.Label
|
||||
}
|
||||
if target.Mountpoint != "" {
|
||||
desc += " mounted=" + target.Mountpoint
|
||||
}
|
||||
items = append(items, desc)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderInterfaceItems(interfaces []platform.InterfaceInfo) []string {
|
||||
items := make([]string, 0, len(interfaces))
|
||||
for _, iface := range interfaces {
|
||||
label := iface.Name
|
||||
if len(iface.IPv4) > 0 {
|
||||
label += " [" + strings.Join(iface.IPv4, ", ") + "]"
|
||||
}
|
||||
items = append(items, label)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderMenu(title, subtitle string, items []string, cursor int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n%s\n\n", title, subtitle)
|
||||
if len(items) == 0 {
|
||||
body.WriteString("(no items)\n")
|
||||
} else {
|
||||
for i, item := range items {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s\n", prefix, item)
|
||||
}
|
||||
}
|
||||
body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderForm(title string, fields []formField, idx int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
for i, field := range fields {
|
||||
prefix := " "
|
||||
if i == idx {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s: %s\n", prefix, field.Label, field.Value)
|
||||
}
|
||||
body.WriteString("\n[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderConfirm(title, body string, cursor int) string {
|
||||
options := []string{"Confirm", "Cancel"}
|
||||
var out strings.Builder
|
||||
fmt.Fprintf(&out, "%s\n\n%s\n\n", title, body)
|
||||
for i, option := range options {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&out, "%s%s\n", prefix, option)
|
||||
}
|
||||
out.WriteString("\n[←/→/↑/↓] move [enter] select [esc] cancel\n")
|
||||
return out.String()
|
||||
}
|
||||
|
||||
func resultCmd(title, body string, err error, back screen) tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return resultMsg{title: title, body: body, err: err, back: back}
|
||||
}
|
||||
}
|
||||
1058
audit/internal/webui/api.go
Normal file
1058
audit/internal/webui/api.go
Normal file
File diff suppressed because it is too large
Load Diff
92
audit/internal/webui/api_test.go
Normal file
92
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
137
audit/internal/webui/jobs.go
Normal file
137
audit/internal/webui/jobs.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
// All fields are guarded by mu.
type jobState struct {
	lines   []string      // accumulated output lines, oldest first
	done    bool          // set once by finish; never cleared
	err     string        // error message recorded by finish; empty on success
	mu      sync.Mutex
	subs    []chan string // live subscriber channels; each receives future lines
	cancel  func() // optional cancel function; nil if job is not cancellable
	logPath string // when non-empty, appended lines are also persisted to this file
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
func (j *jobState) abort() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.done || j.cancel == nil {
|
||||
return false
|
||||
}
|
||||
j.cancel()
|
||||
return true
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
case ch <- line:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.done = true
|
||||
j.err = errMsg
|
||||
for _, ch := range j.subs {
|
||||
close(ch)
|
||||
}
|
||||
j.subs = nil
|
||||
}
|
||||
|
||||
// subscribe returns a channel that receives all future lines.
|
||||
// Existing lines are returned first, then the channel streams new ones.
|
||||
func (j *jobState) subscribe() ([]string, <-chan string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
existing := make([]string, len(j.lines))
|
||||
copy(existing, j.lines)
|
||||
if j.done {
|
||||
return existing, nil
|
||||
}
|
||||
ch := make(chan string, 256)
|
||||
j.subs = append(j.subs, ch)
|
||||
return existing, ch
|
||||
}
|
||||
|
||||
// jobManager manages async jobs identified by string IDs.
type jobManager struct {
	mu   sync.Mutex           // guards jobs
	jobs map[string]*jobState // live jobs keyed by caller-chosen ID
}

// globalJobs is the process-wide job registry.
var globalJobs = &jobManager{jobs: make(map[string]*jobState)}
|
||||
func (m *jobManager) create(id string) *jobState {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j := &jobState{}
|
||||
m.jobs[id] = j
|
||||
// Schedule cleanup after 30 minutes
|
||||
go func() {
|
||||
time.Sleep(30 * time.Minute)
|
||||
m.mu.Lock()
|
||||
delete(m.jobs, id)
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
return j
|
||||
}
|
||||
|
||||
// isDone returns true if the job has finished (either successfully or with error).
|
||||
func (j *jobState) isDone() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
return j.done
|
||||
}
|
||||
|
||||
func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j, ok := m.jobs[id]
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := os.ReadFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||
lines = lines[:len(lines)-1]
|
||||
}
|
||||
j.lines = append(j.lines, lines...)
|
||||
return j
|
||||
}
|
||||
|
||||
func appendJobLog(path, line string) {
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
_, _ = f.WriteString(line + "\n")
|
||||
}
|
||||
238
audit/internal/webui/kmsg_watcher.go
Normal file
238
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,238 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
// It supports multiple concurrent SAT tasks: a shared event window is open
// while any SAT task is running, and flushed when all tasks complete.
type kmsgWatcher struct {
	mu          sync.Mutex // guards activeCount and window
	activeCount int // number of in-flight SAT tasks
	window      *kmsgWindow // shared event window; nil while no SAT task runs
	statusDB    *app.ComponentStatusDB // destination for flushed warnings; may be nil
}
|
||||
|
||||
// kmsgWindow accumulates deduplicated kernel events observed while at least
// one SAT task is running.
type kmsgWindow struct {
	targets   []string // SAT targets running concurrently
	startedAt time.Time
	seen      map[kmsgEventKey]bool // dedupe set keyed by (id, category)
	events    []kmsgEvent           // first occurrence of each unique key
}

// kmsgEventKey identifies one deduplicated event stream.
type kmsgEventKey struct {
	id       string // BDF or device name
	category string
}

// kmsgEvent is a single kernel log line that matched an error pattern.
type kmsgEvent struct {
	timestamp time.Time
	raw       string   // message text with the kmsg prefix stripped
	ids       []string // BDF addresses or device names extracted
	category  string   // matched pattern category (e.g. "gpu", "pcie", "storage", "memory")
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine. There is no stop
// hook: the goroutine exits on its own when /dev/kmsg cannot be opened or
// its scanner terminates (see run).
func (w *kmsgWatcher) start() {
	go w.run()
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||
// if this is the first task starting.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.activeCount == 0 {
|
||||
w.window = &kmsgWindow{
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
w.activeCount++
|
||||
if w.window != nil {
|
||||
w.window.targets = append(w.window.targets, target)
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||
// it flushes the accumulated events to the status DB.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
w.activeCount--
|
||||
var window *kmsgWindow
|
||||
if w.activeCount <= 0 {
|
||||
w.activeCount = 0
|
||||
window = w.window
|
||||
w.window = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
go w.flushWindow(window)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0":
// lowercase, trimmed, and with the default domain "0000:" prepended when the
// input is the short 2-part "bus:dev.fn" form.
func normalizeBDF(bdf string) string {
	normalized := strings.ToLower(strings.TrimSpace(bdf))
	if strings.Count(normalized, ":") == 1 {
		normalized = "0000:" + normalized
	}
	return normalized
}
|
||||
|
||||
// truncate shortens s to at most maxBytes bytes of content followed by "...".
// Strings of maxBytes bytes or fewer are returned unchanged. Unlike a plain
// byte slice, the cut point is moved back to a rune boundary so the result is
// always valid UTF-8 (the original s[:max] could split a multi-byte rune).
func truncate(s string, maxBytes int) string {
	if len(s) <= maxBytes {
		return s
	}
	cut := maxBytes
	// 0b10xxxxxx marks a UTF-8 continuation byte; back up past them so we
	// never cut inside a multi-byte rune.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool {
	switch target {
	case "nvidia", "nvidia-stress",
		"memory", "memory-stress",
		"storage", "cpu", "sat-stress",
		"amd", "amd-mem", "amd-bandwidth", "amd-stress",
		"platform-stress":
		return true
	default:
		return false
	}
}
|
||||
334
audit/internal/webui/metricsdb.go
Normal file
334
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,334 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
// metricsDBPath is the default on-disk location of the metrics database.
const metricsDBPath = "/appdata/bee/metrics.db"

// MetricsDB persists live metric samples to SQLite.
type MetricsDB struct {
	db *sql.DB // sqlite handle, limited to one open connection (see openMetricsDB)
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db.SetMaxOpenConns(1)
|
||||
if err := initMetricsSchema(db); err != nil {
|
||||
_ = db.Close()
|
||||
return nil, err
|
||||
}
|
||||
return &MetricsDB{db: db}, nil
|
||||
}
|
||||
|
||||
// initMetricsSchema creates the four metric tables if they do not already
// exist. Every table is keyed by unix-second timestamp; the per-device tables
// (gpu/fan/temp) add the device identity to the primary key so one timestamp
// can hold multiple readings.
func initMetricsSchema(db *sql.DB) error {
	_, err := db.Exec(`
CREATE TABLE IF NOT EXISTS sys_metrics (
	ts INTEGER NOT NULL,
	cpu_load_pct REAL,
	mem_load_pct REAL,
	power_w REAL,
	PRIMARY KEY (ts)
);
CREATE TABLE IF NOT EXISTS gpu_metrics (
	ts INTEGER NOT NULL,
	gpu_index INTEGER NOT NULL,
	temp_c REAL,
	usage_pct REAL,
	mem_usage_pct REAL,
	power_w REAL,
	PRIMARY KEY (ts, gpu_index)
);
CREATE TABLE IF NOT EXISTS fan_metrics (
	ts INTEGER NOT NULL,
	name TEXT NOT NULL,
	rpm REAL,
	PRIMARY KEY (ts, name)
);
CREATE TABLE IF NOT EXISTS temp_metrics (
	ts INTEGER NOT NULL,
	name TEXT NOT NULL,
	grp TEXT NOT NULL,
	celsius REAL,
	PRIMARY KEY (ts, name)
);
`)
	return err
}
|
||||
|
||||
// Write inserts one sample into all relevant tables.
|
||||
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
ts := s.Timestamp.Unix()
|
||||
tx, err := m.db.Begin()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, f := range s.Fans {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||
ts, f.Name, f.RPM,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||
ts, t.Name, t.Group, t.Celsius,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
// The query must select (ts, cpu_load_pct, mem_load_pct, power_w) rows; the
// GPU/fan/temp tables are then queried for the covered timestamp range and
// their readings re-attached to each sample. Individual rows that fail to
// scan are skipped — the load is best-effort.
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
	rows, err := m.db.Query(query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	type sysRow struct {
		ts            int64
		cpu, mem, pwr float64
	}
	var sysRows []sysRow
	for rows.Next() {
		var r sysRow
		if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
			continue
		}
		sysRows = append(sysRows, r)
	}
	if len(sysRows) == 0 {
		return nil, nil
	}
	// Collect min/max ts for range query.
	// The callers' queries ORDER BY ts, so first/last bound the range.
	minTS := sysRows[0].ts
	maxTS := sysRows[len(sysRows)-1].ts

	// Load GPU rows in range
	type gpuKey struct {
		ts  int64
		idx int
	}
	gpuData := map[gpuKey]platform.GPUMetricRow{}
	gRows, err := m.db.Query(
		`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
		minTS, maxTS,
	)
	if err == nil {
		defer gRows.Close()
		for gRows.Next() {
			var ts int64
			var g platform.GPUMetricRow
			if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
				gpuData[gpuKey{ts, g.GPUIndex}] = g
			}
		}
	}

	// Load fan rows in range
	type fanKey struct {
		ts   int64
		name string
	}
	fanData := map[fanKey]float64{}
	fRows, err := m.db.Query(
		`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
	)
	if err == nil {
		defer fRows.Close()
		for fRows.Next() {
			var ts int64
			var name string
			var rpm float64
			if err := fRows.Scan(&ts, &name, &rpm); err == nil {
				fanData[fanKey{ts, name}] = rpm
			}
		}
	}

	// Load temp rows in range
	type tempKey struct {
		ts   int64
		name string
	}
	tempData := map[tempKey]platform.TempReading{}
	tRows, err := m.db.Query(
		`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
	)
	if err == nil {
		defer tRows.Close()
		for tRows.Next() {
			var ts int64
			var t platform.TempReading
			if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
				tempData[tempKey{ts, t.Name}] = t
			}
		}
	}

	// Collect unique GPU indices and fan/temp names from loaded data.
	// Sort each list so that sample reconstruction is deterministic regardless
	// of Go's non-deterministic map iteration order.
	seenGPU := map[int]bool{}
	var gpuIndices []int
	for k := range gpuData {
		if !seenGPU[k.idx] {
			seenGPU[k.idx] = true
			gpuIndices = append(gpuIndices, k.idx)
		}
	}
	sort.Ints(gpuIndices)

	seenFan := map[string]bool{}
	var fanNames []string
	for k := range fanData {
		if !seenFan[k.name] {
			seenFan[k.name] = true
			fanNames = append(fanNames, k.name)
		}
	}
	sort.Strings(fanNames)

	seenTemp := map[string]bool{}
	var tempNames []string
	for k := range tempData {
		if !seenTemp[k.name] {
			seenTemp[k.name] = true
			tempNames = append(tempNames, k.name)
		}
	}
	sort.Strings(tempNames)

	// Re-assemble one LiveMetricSample per sys row, attaching whatever
	// GPU/fan/temp readings exist for that exact timestamp.
	samples := make([]platform.LiveMetricSample, len(sysRows))
	for i, r := range sysRows {
		s := platform.LiveMetricSample{
			Timestamp:  time.Unix(r.ts, 0).UTC(),
			CPULoadPct: r.cpu,
			MemLoadPct: r.mem,
			PowerW:     r.pwr,
		}
		for _, idx := range gpuIndices {
			if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
				s.GPUs = append(s.GPUs, g)
			}
		}
		for _, name := range fanNames {
			if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
				s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
			}
		}
		for _, name := range tempNames {
			if t, ok := tempData[tempKey{r.ts, name}]; ok {
				s.Temps = append(s.Temps, t)
			}
		}
		samples[i] = s
	}
	return samples, nil
}
|
||||
|
||||
// ExportCSV writes all sys+gpu data as CSV to w.
// Each output line is one (sys sample, gpu sample) pair from a LEFT JOIN on
// timestamp: a sample with N GPUs produces N lines; a sample with no GPU data
// produces one line with empty GPU columns. Rows that fail to scan are
// skipped; write errors surface via cw.Error() after the final Flush.
func (m *MetricsDB) ExportCSV(w io.Writer) error {
	rows, err := m.db.Query(`
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
FROM sys_metrics s
LEFT JOIN gpu_metrics g ON g.ts = s.ts
ORDER BY s.ts, g.gpu_index
`)
	if err != nil {
		return err
	}
	defer rows.Close()

	cw := csv.NewWriter(w)
	// Header row; individual Write errors are collected by cw.Error() below.
	_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
	for rows.Next() {
		var ts int64
		var cpu, mem, pwr float64
		var gpuIdx sql.NullInt64
		var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
		// Skip malformed rows rather than aborting the whole export.
		if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
			continue
		}
		row := []string{
			strconv.FormatInt(ts, 10),
			strconv.FormatFloat(cpu, 'f', 2, 64),
			strconv.FormatFloat(mem, 'f', 2, 64),
			strconv.FormatFloat(pwr, 'f', 1, 64),
		}
		if gpuIdx.Valid {
			row = append(row,
				strconv.FormatInt(gpuIdx.Int64, 10),
				strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
				strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
				strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
				strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
			)
		} else {
			// No GPU row joined for this timestamp: keep the column count stable.
			row = append(row, "", "", "", "", "")
		}
		_ = cw.Write(row)
	}
	cw.Flush()
	return cw.Error()
}
|
||||
|
||||
// Close closes the database; the close error is deliberately discarded.
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||
|
||||
// nullFloat wraps v in an always-valid sql.NullFloat64.
func nullFloat(v float64) sql.NullFloat64 {
	return sql.NullFloat64{Valid: true, Float64: v}
}
|
||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
1770
audit/internal/webui/pages.go
Normal file
1770
audit/internal/webui/pages.go
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -7,9 +7,263 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 0.4, want: "0"},
|
||||
{in: 61.5, want: "62"},
|
||||
{in: 999.4, want: "999"},
|
||||
{in: 1200, want: "1,2k"},
|
||||
{in: 1250, want: "1,25k"},
|
||||
{in: 1310, want: "1,31k"},
|
||||
{in: 1500, want: "1,5k"},
|
||||
{in: 2600, want: "2,6k"},
|
||||
{in: 10200, want: "10k"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartLegendNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartLegendNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||
CPULoadPct: 10,
|
||||
MemLoadPct: 20,
|
||||
PowerW: 300,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
CPULoadPct: 30,
|
||||
MemLoadPct: 40,
|
||||
PowerW: 320,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
CPULoadPct: 50,
|
||||
MemLoadPct: 60,
|
||||
PowerW: 340,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 1 || names[0] != "GPU 0" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != len(samples) {
|
||||
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||
}
|
||||
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||
t.Fatalf("datasets shape=%v", datasets)
|
||||
}
|
||||
if got := datasets[0][0]; got != 120 {
|
||||
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||
}
|
||||
if got := datasets[0][2]; got != 130 {
|
||||
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 7, PowerW: 170},
|
||||
{GPUIndex: 2, PowerW: 120},
|
||||
{GPUIndex: 0, PowerW: 100},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: 101},
|
||||
{GPUIndex: 7, PowerW: 171},
|
||||
{GPUIndex: 2, PowerW: 121},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||
if len(names) != len(wantNames) {
|
||||
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||
}
|
||||
for i := range wantNames {
|
||||
if names[i] != wantNames[i] {
|
||||
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||
}
|
||||
}
|
||||
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||
}
|
||||
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||
}
|
||||
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||
want := []float64{0, 480, 480, 480, 510, 510}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||
body := renderMetrics()
|
||||
if !strings.Contains(body, "const probe = new Image();") {
|
||||
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartLegendVisible(t *testing.T) {
|
||||
if !chartLegendVisible(8) {
|
||||
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||
}
|
||||
if chartLegendVisible(9) {
|
||||
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 999, want: "999"},
|
||||
{in: 1000, want: "1к"},
|
||||
{in: 1370, want: "1,4к"},
|
||||
{in: 1500, want: "1,5к"},
|
||||
{in: 1700, want: "1,7к"},
|
||||
{in: 2000, want: "2к"},
|
||||
{in: 9999, want: "10к"},
|
||||
{in: 10200, want: "10к"},
|
||||
{in: -1500, want: "-1,5к"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartCanvasHeight(t *testing.T) {
|
||||
if got := chartCanvasHeight(4); got != 360 {
|
||||
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||
}
|
||||
if got := chartCanvasHeight(12); got != 288 {
|
||||
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisOption(t *testing.T) {
|
||||
min := floatPtr(0)
|
||||
max := floatPtr(100)
|
||||
opt := chartYAxisOption(min, max)
|
||||
if opt.Min != min || opt.Max != max {
|
||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||
}
|
||||
if opt.LabelCount != 11 {
|
||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||
}
|
||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||
r1 := newMetricsRing(4)
|
||||
r2 := newMetricsRing(4)
|
||||
r1.push(1000)
|
||||
r1.push(1100)
|
||||
r2.push(1200)
|
||||
r2.push(1300)
|
||||
|
||||
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||
if len(datasets) != 2 {
|
||||
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||
}
|
||||
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != 2 {
|
||||
t.Fatalf("labels=%v want 2 entries", labels)
|
||||
}
|
||||
if labels[0] == "" || labels[1] == "" {
|
||||
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||
body := renderNetworkInline()
|
||||
if !strings.Contains(body, "d.pending_change") {
|
||||
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
@@ -21,9 +275,10 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
Title: "Bee Hardware Audit",
|
||||
BuildLabel: "1.2.3",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
first := httptest.NewRecorder()
|
||||
@@ -31,11 +286,17 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
if first.Code != http.StatusOK {
|
||||
t.Fatalf("first status=%d", first.Code)
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), `iframe`) || !strings.Contains(first.Body.String(), `src="/viewer"`) {
|
||||
t.Fatalf("first body missing iframe viewer: %s", first.Body.String())
|
||||
// Dashboard should contain the audit nav link and hardware summary
|
||||
if !strings.Contains(first.Body.String(), `href="/audit"`) {
|
||||
t.Fatalf("first body missing audit nav link: %s", first.Body.String())
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), "/export/support.tar.gz") {
|
||||
t.Fatalf("first body missing support bundle link: %s", first.Body.String())
|
||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||
}
|
||||
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||
}
|
||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||
t.Fatalf("first cache-control=%q", got)
|
||||
@@ -50,8 +311,135 @@ func TestRootRendersShellWithIframe(t *testing.T) {
|
||||
if second.Code != http.StatusOK {
|
||||
t.Fatalf("second status=%d", second.Code)
|
||||
}
|
||||
if !strings.Contains(second.Body.String(), `src="/viewer"`) {
|
||||
t.Fatalf("second body missing iframe viewer: %s", second.Body.String())
|
||||
if !strings.Contains(second.Body.String(), `Hardware Summary`) {
|
||||
t.Fatalf("second body missing hardware summary: %s", second.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
|
||||
t.Fatalf("audit page missing viewer frame: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `openAuditModal()`) {
|
||||
t.Fatalf("audit page missing action modal trigger: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
||||
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
||||
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
||||
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,8 +491,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if got := strings.TrimSpace(rec.Body.String()); got != body {
|
||||
t.Fatalf("body=%q want %q", got, body)
|
||||
if !strings.Contains(rec.Body.String(), "SERIAL-API") {
|
||||
t.Fatalf("body missing expected serial: %s", rec.Body.String())
|
||||
}
|
||||
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
||||
t.Fatalf("content-type=%q", got)
|
||||
@@ -129,6 +517,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := archive.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
979
audit/internal/webui/tasks.go
Normal file
979
audit/internal/webui/tasks.go
Normal file
@@ -0,0 +1,979 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Task statuses.
|
||||
const (
|
||||
TaskPending = "pending"
|
||||
TaskRunning = "running"
|
||||
TaskDone = "done"
|
||||
TaskFailed = "failed"
|
||||
TaskCancelled = "cancelled"
|
||||
)
|
||||
|
||||
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
var burnNames = map[string]string{
|
||||
"nvidia": "NVIDIA Burn-in",
|
||||
"memory": "Memory Burn-in",
|
||||
"cpu": "CPU Burn-in",
|
||||
"amd": "AMD GPU Burn-in",
|
||||
}
|
||||
|
||||
func nvidiaStressTaskName(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case platform.NvidiaStressLoaderJohn:
|
||||
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||
case platform.NvidiaStressLoaderNCCL:
|
||||
return "NVIDIA GPU Stress (NCCL)"
|
||||
default:
|
||||
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||
}
|
||||
}
|
||||
|
||||
func taskDisplayName(target, profile, loader string) string {
|
||||
name := taskNames[target]
|
||||
if profile != "" {
|
||||
if n, ok := burnNames[target]; ok {
|
||||
name = n
|
||||
}
|
||||
}
|
||||
if target == "nvidia-stress" {
|
||||
name = nvidiaStressTaskName(loader)
|
||||
}
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
Params taskParams `json:"params,omitempty"`
|
||||
}
|
||||
|
||||
type burnPreset struct {
|
||||
NvidiaDiag int
|
||||
DurationSec int
|
||||
}
|
||||
|
||||
func resolveBurnPreset(profile string) burnPreset {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
|
||||
case "acceptance":
|
||||
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
|
||||
default:
|
||||
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
|
||||
}
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
}}
|
||||
case "acceptance":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
}}
|
||||
default: // smoke
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 90, IdleSec: 60},
|
||||
{LoadSec: 90, IdleSec: 30},
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
kmsgWatcher *kmsgWatcher
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
|
||||
const maxTaskHistory = 50
|
||||
|
||||
var (
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runMemoryStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
buildSupportBundle = app.BuildSupportBundle
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
func (q *taskQueue) enqueue(t *Task) {
|
||||
q.mu.Lock()
|
||||
q.assignTaskLogPathLocked(t)
|
||||
q.tasks = append(q.tasks, t)
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// prune removes oldest completed tasks beyond maxTaskHistory.
|
||||
func (q *taskQueue) prune() {
|
||||
var done []*Task
|
||||
var active []*Task
|
||||
for _, t := range q.tasks {
|
||||
switch t.Status {
|
||||
case TaskDone, TaskFailed, TaskCancelled:
|
||||
done = append(done, t)
|
||||
default:
|
||||
active = append(active, t)
|
||||
}
|
||||
}
|
||||
if len(done) > maxTaskHistory {
|
||||
done = done[len(done)-maxTaskHistory:]
|
||||
}
|
||||
q.tasks = append(active, done...)
|
||||
}
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
continue
|
||||
}
|
||||
if best == nil || t.Priority > best.Priority ||
|
||||
(t.Priority == best.Priority && t.CreatedAt.Before(best.CreatedAt)) {
|
||||
best = t
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// findByID looks up a task by ID.
|
||||
func (q *taskQueue) findByID(id string) (*Task, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID == id {
|
||||
return t, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// findJob returns the jobState for a task ID (for SSE streaming compatibility).
|
||||
func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
t, ok := q.findByID(id)
|
||||
if !ok || t.job == nil {
|
||||
return nil, false
|
||||
}
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
type taskStreamSource struct {
|
||||
status string
|
||||
errMsg string
|
||||
logPath string
|
||||
job *jobState
|
||||
}
|
||||
|
||||
func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID != id {
|
||||
continue
|
||||
}
|
||||
return taskStreamSource{
|
||||
status: t.Status,
|
||||
errMsg: t.ErrMsg,
|
||||
logPath: t.LogPath,
|
||||
job: t.job,
|
||||
}, true
|
||||
}
|
||||
return taskStreamSource{}, false
|
||||
}
|
||||
|
||||
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.Target != target {
|
||||
continue
|
||||
}
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display with newest tasks first.
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
out := make([]Task, len(q.tasks))
|
||||
for i, t := range q.tasks {
|
||||
out[i] = *t
|
||||
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
|
||||
return out[i].CreatedAt.After(out[j].CreatedAt)
|
||||
}
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
return si < sj
|
||||
}
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].Name < out[j].Name
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func statusOrder(s string) int {
|
||||
switch s {
|
||||
case TaskRunning:
|
||||
return 0
|
||||
case TaskPending:
|
||||
return 1
|
||||
default:
|
||||
return 2
|
||||
}
|
||||
}
|
||||
|
||||
// startWorker launches the queue runner goroutine.
|
||||
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
q.mu.Lock()
|
||||
q.opts = opts
|
||||
q.statePath = filepath.Join(opts.ExportDir, "tasks-state.json")
|
||||
q.logsDir = filepath.Join(opts.ExportDir, "tasks")
|
||||
_ = os.MkdirAll(q.logsDir, 0755)
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
go q.worker()
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
q.mu.Unlock()
|
||||
if hasPending {
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) worker() {
|
||||
for {
|
||||
<-q.trigger
|
||||
setCPUGovernor("performance")
|
||||
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
break
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
|
||||
q.runTask(t, j, taskCtx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning {
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
}
|
||||
}
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
|
||||
setCPUGovernor("powersave")
|
||||
}
|
||||
}
|
||||
|
||||
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||
// Silently ignores errors (e.g. when cpufreq is not available).
|
||||
func setCPUGovernor(governor string) {
|
||||
matches, err := filepath.Glob("/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor")
|
||||
if err != nil || len(matches) == 0 {
|
||||
return
|
||||
}
|
||||
for _, path := range matches {
|
||||
_ = os.WriteFile(path, []byte(governor), 0644)
|
||||
}
|
||||
}
|
||||
|
||||
// runTask executes the work for a task, writing output to j.
|
||||
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if q.opts == nil {
|
||||
j.append("ERROR: handler options not configured")
|
||||
j.finish("handler options not configured")
|
||||
return
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if len(j.lines) > 0 {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := t.params.DiagLevel
|
||||
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||
)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "memory-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "sat-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
opts.Components = t.params.PlatformComponents
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
for _, line := range splitLines(result.Body) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "support-bundle":
|
||||
j.append("Building support bundle...")
|
||||
archive, err = buildSupportBundle(q.opts.ExportDir)
|
||||
case "install":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||
j.append("Install log: " + installLogPath)
|
||||
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||
case "install-to-ram":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
return
|
||||
}
|
||||
|
||||
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil {
|
||||
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
}
|
||||
if db := q.statusDB(); db != nil {
|
||||
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
if archive != "" {
|
||||
j.append("Archive: " + archive)
|
||||
}
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
return nil
|
||||
}
|
||||
return q.opts.App.StatusDB
|
||||
}
|
||||
|
||||
// splitLines breaks s on newline boundaries and returns the non-empty
// lines in order.
func splitLines(s string) []string {
	var lines []string
	for _, line := range strings.Split(s, "\n") {
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}
	return lines
}
|
||||
|
||||
// splitNL splits s on '\n' into its component lines. The result always has
// at least one element; a trailing newline yields a trailing empty string.
// This is exactly strings.Split semantics — the previous hand-rolled loop
// duplicated the standard library for no benefit.
func splitNL(s string) []string {
	return strings.Split(s, "\n")
}
|
||||
|
||||
// ── HTTP handlers ─────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPITasksList(w http.ResponseWriter, _ *http.Request) {
|
||||
tasks := globalQueue.snapshot()
|
||||
writeJSON(w, tasks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
t, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
writeError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
}
|
||||
|
||||
// handleAPITasksPriority adjusts the scheduling priority of a pending task.
//
// Route: /api/tasks/{id}/priority with JSON body {"delta": n}; the delta is
// added to the task's current priority and the new value is returned.
// Responds 404 for unknown IDs, 400 for an undecodable body, and 409 when
// the task is no longer pending.
func (h *handler) handleAPITasksPriority(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")
	t, ok := globalQueue.findByID(id)
	if !ok {
		writeError(w, http.StatusNotFound, "task not found")
		return
	}
	var req struct {
		Delta int `json:"delta"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeError(w, http.StatusBadRequest, "invalid body")
		return
	}
	// Take the queue lock only after body decoding, and re-check the status
	// under it: the worker may have started the task since findByID.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if t.Status != TaskPending {
		writeError(w, http.StatusConflict, "only pending tasks can be reprioritised")
		return
	}
	t.Priority += req.Delta
	globalQueue.persistLocked()
	writeJSON(w, map[string]int{"priority": t.Priority})
}
|
||||
|
||||
func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request) {
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
n := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
}
|
||||
}
|
||||
globalQueue.persistLocked()
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
// handleAPITasksKillWorkers is the "stop everything" action: it cancels
// every pending/running task in the queue and then kills orphaned test
// worker processes at the OS level via platform.KillTestWorkers.
//
// Responds with the number of queue tasks cancelled, the number of
// processes killed, and the list of killed processes.
func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Request) {
	// Cancel all queued/running tasks in the queue first.
	globalQueue.mu.Lock()
	now := time.Now()
	cancelled := 0
	for _, t := range globalQueue.tasks {
		switch t.Status {
		case TaskPending:
			t.Status = TaskCancelled
			t.DoneAt = &now
			cancelled++
		case TaskRunning:
			// Abort the live job so its context is cancelled before the
			// OS-level process sweep below.
			if t.job != nil {
				t.job.abort()
			}
			t.Status = TaskCancelled
			t.DoneAt = &now
			cancelled++
		}
	}
	globalQueue.persistLocked()
	globalQueue.mu.Unlock()

	// Kill orphaned test worker processes at the OS level.
	killed := platform.KillTestWorkers()
	writeJSON(w, map[string]any{
		"cancelled": cancelled,
		"killed":    len(killed),
		"processes": killed,
	})
}
|
||||
|
||||
// handleAPITasksStream streams a task's log to the client over SSE.
//
// Three cases, in order:
//   - the task has a live job: stream it directly;
//   - the task is terminal (done/failed/cancelled) with no live job:
//     rebuild a job state from the persisted log file and replay it;
//   - the task is still queued: start the SSE stream immediately with a
//     "waiting" line, then poll every 200ms until a live job appears or
//     the task reaches a terminal state, handing off to the subscribed
//     stream variant (SSE headers were already sent at that point).
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")
	src, ok := globalQueue.taskStreamSource(id)
	if !ok {
		http.Error(w, "task not found", http.StatusNotFound)
		return
	}
	if src.job != nil {
		streamJob(w, r, src.job)
		return
	}
	if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
		// Terminal task: replay the persisted log and finish immediately.
		j := newTaskJobState(src.logPath)
		j.finish(src.errMsg)
		streamJob(w, r, j)
		return
	}
	if !sseStart(w) {
		return
	}
	sseWrite(w, "", "Task is queued. Waiting for worker...")
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// Re-resolve the stream source each tick; the task may have been
			// picked up by a worker, finished, or pruned since last look.
			src, ok = globalQueue.taskStreamSource(id)
			if !ok {
				sseWrite(w, "done", "task not found")
				return
			}
			if src.job != nil {
				streamSubscribedJob(w, r, src.job)
				return
			}
			if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
				j := newTaskJobState(src.logPath)
				j.finish(src.errMsg)
				streamSubscribedJob(w, r, j)
				return
			}
		case <-r.Context().Done():
			// Client disconnected; stop polling.
			return
		}
	}
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
||||
return
|
||||
}
|
||||
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
||||
}
|
||||
|
||||
// loadLocked restores the queue from the persisted state file, then prunes
// and re-persists the cleaned-up result. Callers must hold q.mu (or call it
// before the queue is shared).
//
// Recovery rules:
//   - pending tasks are re-queued with stale start/done/error state wiped;
//   - tasks that were running at shutdown are marked failed (see inline
//     comment: their child processes cannot be cancelled retroactively);
//   - other statuses are restored verbatim.
//
// A missing, empty, or malformed state file is silently ignored — starting
// with an empty queue is the safe fallback.
func (q *taskQueue) loadLocked() {
	if q.statePath == "" {
		return
	}
	data, err := os.ReadFile(q.statePath)
	if err != nil || len(data) == 0 {
		return
	}
	var persisted []persistedTask
	if err := json.Unmarshal(data, &persisted); err != nil {
		return
	}
	for _, pt := range persisted {
		t := &Task{
			ID:        pt.ID,
			Name:      pt.Name,
			Target:    pt.Target,
			Priority:  pt.Priority,
			Status:    pt.Status,
			CreatedAt: pt.CreatedAt,
			StartedAt: pt.StartedAt,
			DoneAt:    pt.DoneAt,
			ErrMsg:    pt.ErrMsg,
			LogPath:   pt.LogPath,
			params:    pt.Params,
		}
		// Derive a log path when the persisted record has none.
		q.assignTaskLogPathLocked(t)
		if t.Status == TaskRunning {
			// The task was interrupted by a bee-web restart. Child processes
			// (e.g. bee-gpu-burn-worker) survive the restart in their own
			// process groups and cannot be cancelled retroactively. Mark the
			// task as failed so the user can decide whether to re-run it
			// rather than blindly re-launching duplicate workers.
			now := time.Now()
			t.Status = TaskFailed
			t.DoneAt = &now
			t.ErrMsg = "interrupted by bee-web restart"
		} else if t.Status == TaskPending {
			// Re-queue cleanly: wipe any stale run metadata.
			t.StartedAt = nil
			t.DoneAt = nil
			t.ErrMsg = ""
		}
		q.tasks = append(q.tasks, t)
	}
	q.prune()
	q.persistLocked()
}
|
||||
|
||||
func (q *taskQueue) persistLocked() {
|
||||
if q.statePath == "" {
|
||||
return
|
||||
}
|
||||
state := make([]persistedTask, 0, len(q.tasks))
|
||||
for _, t := range q.tasks {
|
||||
state = append(state, persistedTask{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Priority: t.Priority,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
ErrMsg: t.ErrMsg,
|
||||
LogPath: t.LogPath,
|
||||
Params: t.params,
|
||||
})
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tmp := q.statePath + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.Rename(tmp, q.statePath)
|
||||
}
|
||||
|
||||
func taskElapsedSec(t *Task, now time.Time) int {
|
||||
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||
return 0
|
||||
}
|
||||
start := *t.StartedAt
|
||||
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||
start = t.CreatedAt
|
||||
}
|
||||
end := now
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = *t.DoneAt
|
||||
}
|
||||
if end.Before(start) {
|
||||
return 0
|
||||
}
|
||||
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||
}
|
||||
469
audit/internal/webui/tasks_test.go
Normal file
469
audit/internal/webui/tasks_test.go
Normal file
@@ -0,0 +1,469 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
)
|
||||
|
||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
|
||||
// A task that was pending (not yet started) must be re-queued on restart.
|
||||
pendingTask := &Task{
|
||||
ID: "task-pending",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
params: taskParams{Duration: 300, BurnProfile: "smoke"},
|
||||
}
|
||||
// A task that was running when bee-web crashed must NOT be re-queued —
|
||||
// its child processes (e.g. gpu-burn-worker) survive the restart in
|
||||
// their own process groups and can't be cancelled retroactively.
|
||||
runningTask := &Task{
|
||||
ID: "task-running",
|
||||
Name: "NVIDIA GPU Stress",
|
||||
Target: "nvidia-stress",
|
||||
Priority: 1,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-3 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{Duration: 86400},
|
||||
}
|
||||
for _, task := range []*Task{pendingTask, runningTask} {
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
}
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
statePath: q.statePath,
|
||||
logsDir: q.logsDir,
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(recovered.tasks))
|
||||
}
|
||||
|
||||
byID := map[string]*Task{}
|
||||
for i := range recovered.tasks {
|
||||
byID[recovered.tasks[i].ID] = recovered.tasks[i]
|
||||
}
|
||||
|
||||
// Pending task must be re-queued as pending with params intact.
|
||||
p := byID["task-pending"]
|
||||
if p == nil {
|
||||
t.Fatal("task-pending not found")
|
||||
}
|
||||
if p.Status != TaskPending {
|
||||
t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
|
||||
}
|
||||
if p.StartedAt != nil {
|
||||
t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
|
||||
}
|
||||
if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("pending task: params=%+v", p.params)
|
||||
}
|
||||
if p.LogPath == "" {
|
||||
t.Fatal("pending task: expected log path")
|
||||
}
|
||||
|
||||
// Running task must be marked failed, not re-queued, to prevent
|
||||
// launching duplicate workers (e.g. a second set of gpu-burn-workers).
|
||||
r := byID["task-running"]
|
||||
if r == nil {
|
||||
t.Fatal("task-running not found")
|
||||
}
|
||||
if r.Status != TaskFailed {
|
||||
t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
|
||||
}
|
||||
if r.ErrMsg == "" {
|
||||
t.Fatal("running task: expected non-empty error message")
|
||||
}
|
||||
if r.DoneAt == nil {
|
||||
t.Fatal("running task: expected done_at to be set")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewTaskJobStateLoadsExistingLog checks that a job state built from an
// existing log file replays the persisted lines to a new subscriber and
// still hands out a live subscription channel.
func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "task.log")
	if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
		t.Fatal(err)
	}

	j := newTaskJobState(path)
	existing, ch := j.subscribe()
	if ch == nil {
		t.Fatal("expected live subscription channel")
	}
	if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
		t.Fatalf("existing=%v", existing)
	}
}
|
||||
|
||||
// TestTaskQueueSnapshotSortsNewestFirst builds a queue whose tasks have
// mixed statuses, priorities, and creation times, and asserts that
// snapshot() orders them newest-first by CreatedAt — status and priority
// must not influence the listing order.
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
	now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
	q := &taskQueue{
		tasks: []*Task{
			{
				ID:        "old-running",
				Name:      "Old Running",
				Status:    TaskRunning,
				Priority:  10,
				CreatedAt: now.Add(-3 * time.Minute),
			},
			{
				ID:        "new-done",
				Name:      "New Done",
				Status:    TaskDone,
				Priority:  0,
				CreatedAt: now.Add(-1 * time.Minute),
			},
			{
				ID:        "mid-pending",
				Name:      "Mid Pending",
				Status:    TaskPending,
				Priority:  1,
				CreatedAt: now.Add(-2 * time.Minute),
			},
		},
	}

	got := q.snapshot()
	if len(got) != 3 {
		t.Fatalf("snapshot len=%d want 3", len(got))
	}
	if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
		t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
	}
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
// TestResolveBurnPreset verifies the NVIDIA diag level and duration of each
// named burn profile, and that an empty profile resolves to the same values
// as "smoke".
func TestResolveBurnPreset(t *testing.T) {
	tests := []struct {
		profile string
		want    burnPreset
	}{
		{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
		{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
		{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
		{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
	}
	for _, tc := range tests {
		if got := resolveBurnPreset(tc.profile); got != tc.want {
			t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
		}
	}
}
|
||||
|
||||
// TestTaskDisplayNameUsesNvidiaStressLoader checks that the nvidia-stress
// display name reflects the selected loader, with empty and "builtin"
// loaders both shown as bee-gpu-burn.
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
	tests := []struct {
		loader string
		want   string
	}{
		{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
		{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
		{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
		{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
	}
	for _, tc := range tests {
		if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
			t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
		}
	}
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
blocked := make(chan struct{})
|
||||
released := make(chan struct{})
|
||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
close(blocked)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
close(released)
|
||||
return "", ctx.Err()
|
||||
case <-time.After(5 * time.Second):
|
||||
close(released)
|
||||
return "unexpected", nil
|
||||
}
|
||||
}
|
||||
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Duration: 60},
|
||||
}
|
||||
j := &jobState{}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
tk.job = j
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return aRun(nil, ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
q.runTask(tk, j, ctx)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
<-blocked
|
||||
j.abort()
|
||||
|
||||
select {
|
||||
case <-released:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("task did not observe cancel")
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("runTask did not return after cancel")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-burn-1",
|
||||
Name: "CPU Burn-in",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{BurnProfile: "smoke"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||
gotDuration = durationSec
|
||||
return "/tmp/cpu-burn.tar.gz", nil
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDuration != 5*60 {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "support-bundle-1",
|
||||
Name: "Support Bundle",
|
||||
Target: "support-bundle",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotExportDir string
|
||||
orig := buildSupportBundle
|
||||
buildSupportBundle = func(exportDir string) (string, error) {
|
||||
gotExportDir = exportDir
|
||||
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||
}
|
||||
defer func() { buildSupportBundle = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotExportDir != dir {
|
||||
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||
t.Fatalf("lines=%v", j.lines)
|
||||
}
|
||||
}
|
||||
|
||||
// TestTaskElapsedSecClampsInvalidStartedAt covers two degenerate start
// stamps: a zero-value StartedAt must yield 0 elapsed seconds, and a
// StartedAt earlier than CreatedAt must be clamped so elapsed time is
// measured from CreatedAt instead.
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
	now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
	created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
	started := time.Time{}
	task := &Task{
		Status:    TaskRunning,
		CreatedAt: created,
		StartedAt: &started,
	}
	if got := taskElapsedSec(task, now); got != 0 {
		t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
	}

	// Start stamp a full day before creation: must clamp to CreatedAt.
	stale := created.Add(-24 * time.Hour)
	task.StartedAt = &stale
	if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
		t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
	}
}
|
||||
|
||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "install-1",
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Device: "/dev/sda"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotDevice string
|
||||
var gotLogPath string
|
||||
orig := installCommand
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
gotDevice = device
|
||||
gotLogPath = logPath
|
||||
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||
}
|
||||
defer func() { installCommand = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDevice != "/dev/sda" {
|
||||
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||
}
|
||||
if gotLogPath == "" {
|
||||
t.Fatal("expected install log path")
|
||||
}
|
||||
logs := strings.Join(j.lines, "\n")
|
||||
if !strings.Contains(logs, "Install log: ") {
|
||||
t.Fatalf("missing install log line: %v", j.lines)
|
||||
}
|
||||
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||
t.Fatalf("missing streamed output: %v", j.lines)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
}
|
||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
# Resolve the build version string from git metadata and print it on stdout.
# Output: the nearest v-prefixed tag with the leading "v" stripped (plus any
# -N-gHASH/-dirty suffix from git describe), "dev" when no tag is reachable,
# or the raw describe output for tags not starting with "v".
set -eu

# Best effort: describe fails in shallow clones or untagged repos, so the
# error is suppressed and tag is left empty.
tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"

case "${tag}" in
v*)
	# Strip the leading "v": v4.6 -> 4.6
	printf '%s\n' "${tag#v}"
	;;
"")
	# No reachable tag: fall back to a generic dev version.
	printf 'dev\n'
	;;
*)
	# Unexpected tag shape: pass it through unchanged.
	printf '%s\n' "${tag}"
	;;
esac
|
||||
67
bible-local/architecture/charting.md
Normal file
67
bible-local/architecture/charting.md
Normal file
@@ -0,0 +1,67 @@
|
||||
# Charting architecture
|
||||
|
||||
## Decision: one chart engine for all live metrics
|
||||
|
||||
**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output)
|
||||
**Theme:** `grafana` (dark background, coloured lines)
|
||||
|
||||
All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
## Rule: live charts must be visually uniform
|
||||
|
||||
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||
changes to existing charts must keep the same rendering model and presentation
|
||||
rules unless there is an explicit architectural decision to diverge.
|
||||
|
||||
Default expectations:
|
||||
|
||||
- same server-side SVG pipeline for all live metrics charts
|
||||
- same refresh behaviour and failure handling in the browser
|
||||
- same canvas size class and card layout
|
||||
- same legend placement policy across charts
|
||||
- same axis, title, and summary conventions
|
||||
- no chart-specific visual exceptions added as a quick fix
|
||||
|
||||
Current default for live charts:
|
||||
|
||||
- legend below the plot area when a chart has 8 series or fewer
|
||||
- legend hidden when a chart has more than 8 series
|
||||
- 10 equal Y-axis steps across the chart height
|
||||
- 1400 × 360 SVG canvas with legend
|
||||
- 1400 × 288 SVG canvas without legend
|
||||
- full-width card rendering in a single-column stack
|
||||
|
||||
If one chart needs a different layout or legend behaviour, treat that as a
|
||||
design-level decision affecting the whole chart family, not as a local tweak to
|
||||
just one endpoint.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
- SVG output — crisp at any display resolution, full-width without pixelation
|
||||
- Grafana theme matches the dark web UI colour scheme
|
||||
- Active fork of the archived wcharczuk/go-chart
|
||||
|
||||
### SAT stress-test charts
|
||||
|
||||
The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate
|
||||
self-contained SVG renderer used **only** for completed SAT run reports
|
||||
(HTML export, burn-in summaries). It is not used for live metrics.
|
||||
|
||||
### Live metrics chart endpoints
|
||||
|
||||
| Path | Content |
|
||||
|------|---------|
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||
the legend is hidden. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz).
|
||||
Buffers are per-server or per-GPU and grow dynamically as new GPUs appear.
|
||||
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
||||
|
||||
## Boot sequence (single ISO)
|
||||
|
||||
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||
|
||||
`systemd` boot order:
|
||||
|
||||
```
|
||||
@@ -20,11 +22,12 @@ local-fs.target
|
||||
│ creates /dev/nvidia* nodes)
|
||||
├── bee-audit.service (runs `bee audit` → /var/log/bee-audit.json,
|
||||
│ never blocks boot on partial collector failures)
|
||||
└── bee-web.service (runs `bee web` on :80,
|
||||
reads the latest audit snapshot on each request)
|
||||
├── bee-web.service (runs `bee web` on :80 — full interactive web UI)
|
||||
└── bee-desktop.service (startx → openbox + chromium http://localhost/)
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||
@@ -41,18 +44,24 @@ Local-console behavior:
|
||||
```text
|
||||
tty1
|
||||
└── live-config autologin → bee
|
||||
└── /home/bee/.profile
|
||||
└── exec menu
|
||||
└── /usr/local/bin/bee-tui
|
||||
└── sudo -n /usr/local/bin/bee tui --runtime livecd
|
||||
└── /home/bee/.profile (prints web UI URLs)
|
||||
|
||||
display :0
|
||||
└── bee-desktop.service (User=bee)
|
||||
└── startx /usr/local/bin/bee-openbox-session -- :0
|
||||
├── tint2 (taskbar)
|
||||
├── chromium http://localhost/
|
||||
└── openbox (WM)
|
||||
```
|
||||
|
||||
Rules:
|
||||
- local `tty1` lands in user `bee`, not directly in `root`
|
||||
- `menu` must work without typing `sudo`
|
||||
- TUI actions still run as `root` via `sudo -n`
|
||||
- SSH is independent from the tty1 path
|
||||
- `bee-desktop.service` starts X11 + openbox + Chromium automatically after `bee-web.service`
|
||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||
- SSH is independent from the desktop path
|
||||
- serial console support is enabled for VM boot debugging
|
||||
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||
|
||||
## ISO build sequence
|
||||
|
||||
@@ -71,24 +80,39 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
||||
d. build kernel modules against Debian headers
|
||||
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
|
||||
10. write staged `/etc/bee-release` (versions + git commit)
|
||||
11. patch staged `motd` with build metadata
|
||||
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
13. sync staged overlay into workdir `config/includes.chroot/`
|
||||
14. run `lb config && lb build` inside the privileged builder container
|
||||
7. `build-cublas.sh`:
|
||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||
b. verify packages against repo `Packages.gz`
|
||||
c. extract headers for `bee-gpu-burn` worker build
|
||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||
12. write staged `/etc/bee-release` (versions + git commit)
|
||||
13. patch staged `motd` with build metadata
|
||||
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||
16. run `lb config && lb build` inside the privileged builder container
|
||||
```
|
||||
|
||||
Build host notes:
|
||||
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||
|
||||
**Critical invariants:**
|
||||
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||
|
||||
## Post-boot smoke test
|
||||
|
||||
@@ -104,7 +128,7 @@ Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks pres
|
||||
systemd services running, audit completed with NVIDIA enrichment, LAN reachability.
|
||||
|
||||
Current validation state:
|
||||
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup
|
||||
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and Web UI startup
|
||||
- real hardware validation is still required before treating the ISO as release-ready
|
||||
|
||||
## Overlay mechanism
|
||||
@@ -131,43 +155,31 @@ Current validation state:
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||
- Ada / Hopper: add `fp8`
|
||||
- Blackwell+: add `fp4`
|
||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
- `BEE_MEMTESTER_SIZE_MB`
|
||||
- `BEE_MEMTESTER_PASSES`
|
||||
|
||||
## NVIDIA SAT TUI flow (v1.0.0+)
|
||||
## NVIDIA SAT Web UI flow
|
||||
|
||||
```
|
||||
TUI: Acceptance tests → NVIDIA command pack
|
||||
1. screenNvidiaSATSetup
|
||||
a. enumerate GPUs via `nvidia-smi --query-gpu=index,name,memory.total`
|
||||
b. user selects duration preset: 10 min / 1 h / 8 h / 24 h
|
||||
c. user selects GPUs via checkboxes (all selected by default)
|
||||
d. memory size = max(selected GPU memory) — auto-detected, not exposed to user
|
||||
2. Start → screenNvidiaSATRunning
|
||||
a. CUDA_VISIBLE_DEVICES set to selected GPU indices
|
||||
b. tea.Batch: SAT goroutine + tea.ExecProcess(nvtop) launched concurrently
|
||||
c. nvtop occupies full terminal; SAT result queues in background
|
||||
d. [o] reopen nvtop at any time; [a] abort (cancels context → kills bee-gpu-stress)
|
||||
3. GPU metrics collection (during bee-gpu-stress)
|
||||
- background goroutine polls `nvidia-smi` every second
|
||||
- per-second rows: elapsed, GPU index, temp°C, usage%, power W, clock MHz
|
||||
- outputs: gpu-metrics.csv, gpu-metrics.html (offline SVG chart), gpu-metrics-term.txt
|
||||
4. After SAT completes
|
||||
- result shown in screenOutput with terminal line-chart (gpu-metrics-term.txt)
|
||||
- chart is asciigraph-style: box-drawing chars (╭╮╰╯─│), 4 series per GPU,
|
||||
Y axis with ticks, ANSI colours (red=temp, blue=usage, green=power, yellow=clock)
|
||||
Web UI: Acceptance Tests page → Run Test button
|
||||
1. POST /api/sat/nvidia/run → returns job_id
|
||||
2. GET /api/sat/stream?job_id=... (SSE) — streams stdout/stderr lines live
|
||||
3. After completion — archive written to /appdata/bee/export/bee-sat/
|
||||
summary.txt contains overall_status (OK / FAILED) and per-job status values
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- `nvtop` must be in `iso/builder/config/package-lists/bee.list.chroot` (baked into ISO).
|
||||
- `bee-gpu-stress` uses `exec.CommandContext` — aborted on cancel.
|
||||
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||
- If `nvtop` is not found on PATH, SAT still runs without it (graceful degradation).
|
||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||
|
||||
@@ -21,13 +21,14 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||
- Machine-readable health summary derived from collector verdicts
|
||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
|
||||
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- Automatic boot audit with operator-facing local console and SSH access
|
||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||
- SSH access (OpenSSH) always available for inspection and debugging
|
||||
- Interactive Go TUI via `bee tui` for network setup, service management, and acceptance tests
|
||||
- Read-only web viewer via `bee web`, rendering the latest audit snapshot through the embedded Reanimator Chart
|
||||
- Local `tty1` operator UX: `bee` autologin, `menu` auto-start, privileged actions via `sudo -n`
|
||||
- Full web UI via `bee web` on port 80: interactive control panel with live metrics, SAT tests, network config, service management, export, and tools
|
||||
- Local operator desktop: openbox + Xorg + Chromium auto-opening `http://localhost/`
|
||||
- Local `tty1` operator UX: `bee` autologin, openbox desktop auto-starts with Chromium on `http://localhost/`
|
||||
|
||||
## Network isolation — CRITICAL
|
||||
|
||||
@@ -69,15 +70,18 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| SSH | OpenSSH server |
|
||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||
|
||||
## Operator UX
|
||||
|
||||
- On the live ISO, `tty1` automatically logs in as user `bee`
|
||||
- The login profile auto-runs `menu`, which enters the Go TUI
|
||||
- The TUI itself executes privileged actions as `root` via `sudo -n`
|
||||
- `bee-desktop.service` starts X11 + openbox + Chromium on display `:0`
|
||||
- Chromium opens `http://localhost/` — the full web UI
|
||||
- SSH remains available independently of the local console path
|
||||
- Remote operators can open `http://<ip>/` in any browser on the same LAN
|
||||
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||
|
||||
## Runtime split
|
||||
|
||||
@@ -85,6 +89,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
|
||||
|
||||
## Key paths
|
||||
|
||||
@@ -99,7 +104,10 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy |
|
||||
| `iso/overlay/home/bee/.profile` | `bee` shell profile for local console startup |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||
| `iso/overlay/home/bee/.profile` | `bee` shell profile (PATH only) |
|
||||
| `iso/overlay/etc/systemd/system/bee-desktop.service` | starts X11 + openbox + chromium |
|
||||
| `iso/overlay/usr/local/bin/bee-desktop` | startx wrapper for bee-desktop.service |
|
||||
| `iso/overlay/usr/local/bin/bee-openbox-session` | xinitrc: tint2 + chromium + openbox |
|
||||
| `dist/` | Build outputs (gitignored) |
|
||||
| `iso/out/` | Downloaded ISO files (gitignored) |
|
||||
|
||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
||||
- Kernel modules and nvidia-smi come from a single verified source.
|
||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||
|
||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||
The commit history shows several distinct attempts:
|
||||
|
||||
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||
|
||||
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||
|
||||
- `lb binary_memtest` does run and installs `memtest86+`
|
||||
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||
|
||||
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||
|
||||
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||
|
||||
- the build now completes successfully because memtest is non-blocking by default
|
||||
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||
- but it executes too early for its current target paths:
|
||||
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||
- memtest binaries are also still absent in `binary/boot/`
|
||||
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||
|
||||
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||
|
||||
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
artifact dated 2026-04-01:
|
||||
|
||||
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||
and `isolinux/live.cfg`
|
||||
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||
shipped ISO
|
||||
- the regression was in the build-time validator/debug path in `build.sh`
|
||||
|
||||
Root cause of the false alarm:
|
||||
|
||||
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||
successfully listed/extracted members"
|
||||
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||
observable output as "memtest content missing"
|
||||
- this made a reader failure look identical to a missing memtest payload
|
||||
- as a result, we re-entered the same memtest investigation loop even though
|
||||
the real ISO was already correct
|
||||
|
||||
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||
|
||||
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||
`iso_memtest_present` return code of `1` as fatal
|
||||
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||
the recovery design itself was wrong
|
||||
|
||||
## Known Failed Attempts
|
||||
|
||||
These approaches were already tried and should not be repeated blindly:
|
||||
|
||||
1. Built-in live-build memtest only.
|
||||
Reason it failed:
|
||||
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||
|
||||
2. Fixing only the memtest file names for Debian Bookworm.
|
||||
Reason it failed:
|
||||
- correct file names alone do not make the files appear in the final ISO.
|
||||
|
||||
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||
Reason it failed:
|
||||
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||
|
||||
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||
Reason it failed:
|
||||
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||
|
||||
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||
Reason it failed:
|
||||
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||
|
||||
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||
Reason it failed:
|
||||
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||
|
||||
## What This Means
|
||||
|
||||
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||
|
||||
- do not assume the built-in memtest stage is sufficient
|
||||
- do not assume `chroot/boot/` will contain memtest payloads
|
||||
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||
- do not assume the current normal binary hook timing is late enough for final patching
|
||||
|
||||
Any future memtest fix must explicitly identify:
|
||||
|
||||
- where the memtest binaries are reliably available at build time
|
||||
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||
the validator printed a memtest warning
|
||||
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||
context rather than accidentally tripping `set -e`
|
||||
|
||||
## Decision
|
||||
|
||||
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||
|
||||
Project rules from now on:
|
||||
|
||||
- Do **not** trust `--memtest memtest86+` by itself.
|
||||
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a GRUB menu entry
|
||||
- an isolinux menu entry
|
||||
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||
- a binary hook copying files into `binary/boot/`
|
||||
- extraction from the cached `memtest86+` `.deb`
|
||||
- another deterministic build-time copy step
|
||||
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||
|
||||
Current implementation direction:
|
||||
|
||||
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||
- do not rely on the current early `binary_hooks` timing for final patching
|
||||
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||
- patches the fully materialized `LB_DIR/binary` tree
|
||||
- injects memtest binaries there
|
||||
- ensures final bootloader entries there
|
||||
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||
- also treat ISO validation tooling as part of the critical path:
|
||||
- install a stable ISO reader in the builder image
|
||||
- fail with an explicit reader error if ISO listing/extraction fails
|
||||
- do not treat reader failure as evidence that memtest is missing
|
||||
- do not call a probe that may return "needs recovery" as a bare command under
|
||||
`set -e`; wrap it in explicit control flow
|
||||
|
||||
## Consequences
|
||||
|
||||
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-negative loop that burned 2026-04-01 v3.14–v3.19.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| Date | Decision | Status |
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
|
||||
62
bible-local/docs/iso-build-rules.md
Normal file
62
bible-local/docs/iso-build-rules.md
Normal file
@@ -0,0 +1,62 @@
|
||||
# ISO Build Rules
|
||||
|
||||
## Verify package names before use
|
||||
|
||||
ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
|
||||
|
||||
**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
|
||||
|
||||
Use one of:
|
||||
- `https://packages.debian.org/bookworm/<package-name>` — existence + description
|
||||
- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
|
||||
- `apt-cache show <package>` inside a Debian bookworm container
|
||||
|
||||
This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||
|
||||
For this project, memtest is accepted only when the produced ISO actually
|
||||
contains all of the following:
|
||||
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a memtest entry in `boot/grub/grub.cfg`
|
||||
- a memtest entry in `isolinux/live.cfg`
|
||||
|
||||
Rules:
|
||||
|
||||
- Keep explicit post-build memtest validation in `build.sh`.
|
||||
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||
If the reader cannot list or extract from the ISO, that is a validator
|
||||
failure, not proof that memtest is missing.
|
||||
- If built-in integration does not produce the artifacts above, use a
|
||||
deterministic project-owned copy/extract step instead of hoping live-build
|
||||
will "start working".
|
||||
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||
a real ISO.
|
||||
- If you reference memtest files manually, verify the exact package file list
|
||||
first for the target Debian release.
|
||||
|
||||
Known bad loops for this repository:
|
||||
|
||||
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||
the final artifact path.
|
||||
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||
bootloader configs.
|
||||
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||
present yet when the hook executed.
|
||||
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||
regression because the final ISO was correct but the validator produced a
|
||||
false negative.
|
||||
35
bible-local/docs/validate-vs-burn.md
Normal file
35
bible-local/docs/validate-vs-burn.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Validate vs Burn: Hardware Impact Policy
|
||||
|
||||
## Validate Tests (non-destructive)
|
||||
|
||||
Tests on the **Validate** page are purely diagnostic. They:
|
||||
|
||||
- **Do not write to disks** — no data is written to storage devices; SMART counters (power-on hours, load cycle count, reallocated sectors) are not incremented.
|
||||
- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
|
||||
- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
|
||||
- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
|
||||
|
||||
### What Validate tests actually do
|
||||
|
||||
| Test | What it runs |
|
||||
|---|---|
|
||||
| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
|
||||
| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
|
||||
| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
|
||||
| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
|
||||
| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
|
||||
|
||||
## Burn Tests (hardware wear)
|
||||
|
||||
Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
|
||||
|
||||
- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
|
||||
- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
|
||||
- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
|
||||
- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics will advance.
|
||||
|
||||
### Rule
|
||||
|
||||
> Run **Validate** freely on any server, at any time, before or after deployment.
|
||||
> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
|
||||
> Document when and why Burn tests were run.
|
||||
Submodule internal/chart updated: 05db6994d4...ac8120c8ab
59
iso/README.md
Normal file
59
iso/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# ISO Build
|
||||
|
||||
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Docker Desktop or another Docker-compatible container runtime
|
||||
- Privileged containers enabled
|
||||
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||
|
||||
## Build On macOS
|
||||
|
||||
From the repository root:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||
|
||||
- Intel Mac
|
||||
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||
|
||||
You do not need to pass `--platform` manually for normal ISO builds.
|
||||
|
||||
## Useful Options
|
||||
|
||||
Build with explicit SSH keys baked into the ISO:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
- The shipped ISO is still `amd64`.
|
||||
- Output ISO artifacts are written under `dist/`.
|
||||
@@ -17,15 +17,40 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
wget \
|
||||
curl \
|
||||
tar \
|
||||
libarchive-tools \
|
||||
xz-utils \
|
||||
rsync \
|
||||
build-essential \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
pkg-config \
|
||||
yasm \
|
||||
libssl-dev \
|
||||
zlib1g-dev \
|
||||
libbz2-dev \
|
||||
libgmp-dev \
|
||||
libpcap-dev \
|
||||
libsqlite3-dev \
|
||||
libcurl4-openssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||
&& rm /tmp/cuda-keyring.gpg \
|
||||
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||
> /etc/apt/sources.list.d/cuda.list \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -y cuda-nvcc-12-8 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
|
||||
|
||||
RUN arch="$(dpkg --print-architecture)" \
|
||||
&& case "$arch" in \
|
||||
amd64) goarch=amd64 ;; \
|
||||
|
||||
@@ -4,5 +4,20 @@ NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||
ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
|
||||
ROCBLAS_VERSION=4.3.0.60304-76~22.04
|
||||
ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -29,9 +29,10 @@ lb config noauto \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
190
iso/builder/build-cublas.sh
Normal file
190
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,190 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
CUBLAS_VERSION="$1"
|
||||
CUDA_USERSPACE_VERSION="$2"
|
||||
CUDA_SERIES="$3"
|
||||
DIST_DIR="$4"
|
||||
|
||||
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
|
||||
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
|
||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||
|
||||
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
|
||||
|
||||
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
|
||||
&& [ -f "${CACHE_DIR}/include/crt/host_defines.h" ] \
|
||||
&& [ -f "${CACHE_DIR}/include/nv/target" ] \
|
||||
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== cuBLAS cached, skipping download ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
|
||||
|
||||
echo "=== downloading Packages.gz ==="
|
||||
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||
|
||||
lookup_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2" # if empty, match any version (first found)
|
||||
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
|
||||
/^Package: / { cur_pkg=$2; gsub(/\r/, "", cur_pkg) }
|
||||
/^Version: / { cur_ver=$2; gsub(/\r/, "", cur_ver) }
|
||||
/^Filename: / { cur_file=$2; gsub(/\r/, "", cur_file) }
|
||||
/^SHA256: / { cur_sha=$2; gsub(/\r/, "", cur_sha) }
|
||||
/^$/ {
|
||||
if (cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
|
||||
print cur_file " " cur_sha
|
||||
printed=1
|
||||
exit
|
||||
}
|
||||
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
|
||||
}
|
||||
END {
|
||||
if (!printed && cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
|
||||
print cur_file " " cur_sha
|
||||
}
|
||||
}'
|
||||
}
|
||||
|
||||
download_verified_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
|
||||
meta="$(lookup_pkg "$pkg" "$ver")"
|
||||
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
|
||||
|
||||
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
|
||||
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
|
||||
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
|
||||
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
|
||||
|
||||
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
|
||||
if [ -f "$out" ]; then
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" = "$repo_sha" ]; then
|
||||
echo "=== using cached $(basename "$repo_file") ===" >&2
|
||||
printf '%s\n' "$out"
|
||||
return 0
|
||||
fi
|
||||
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ===" >&2
|
||||
rm -f "$out"
|
||||
fi
|
||||
|
||||
echo "=== downloading $(basename "$repo_file") ===" >&2
|
||||
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
|
||||
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" != "$repo_sha" ]; then
|
||||
echo "ERROR: sha256 mismatch for $(basename "$repo_file")" >&2
|
||||
echo " expected: $repo_sha" >&2
|
||||
echo " actual: $actual_sha" >&2
|
||||
rm -f "$out"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK: $(basename "$repo_file")" >&2
|
||||
printf '%s\n' "$out"
|
||||
}
|
||||
|
||||
extract_deb() {
|
||||
deb="$1"
|
||||
dst="$2"
|
||||
mkdir -p "$dst"
|
||||
(
|
||||
cd "$dst"
|
||||
ar x "$deb"
|
||||
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
|
||||
tar xf "$data_tar"
|
||||
)
|
||||
}
|
||||
|
||||
copy_headers() {
|
||||
from="$1"
|
||||
if [ -d "${from}/usr/include" ]; then
|
||||
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
# NVIDIA CUDA packages install headers under /usr/local/cuda-X.Y/targets/x86_64-linux/include/
|
||||
find "$from" -type d -name include | while read -r inc_dir; do
|
||||
case "$inc_dir" in
|
||||
*/usr/include) ;; # already handled above
|
||||
*)
|
||||
if find "${inc_dir}" -maxdepth 3 \( -name '*.h' -o -type f \) | grep -q .; then
|
||||
cp -a "${inc_dir}/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
done
|
||||
}
|
||||
|
||||
copy_libs() {
|
||||
from="$1"
|
||||
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
|
||||
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
}
|
||||
|
||||
make_links() {
|
||||
base="$1"
|
||||
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
|
||||
[ -n "$versioned" ] || return 0
|
||||
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
|
||||
target=$(basename "$versioned")
|
||||
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
|
||||
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
|
||||
}
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
|
||||
|
||||
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDA_CRT_DEB=$(download_verified_pkg "cuda-crt-${CUDA_SERIES_DASH}" "")
|
||||
CUDA_CCCL_DEB=$(download_verified_pkg "cuda-cccl-${CUDA_SERIES_DASH}" "")
|
||||
|
||||
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
|
||||
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
|
||||
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
|
||||
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
|
||||
extract_deb "$CUDA_CRT_DEB" "${TMP_DIR}/cuda-crt"
|
||||
extract_deb "$CUDA_CCCL_DEB" "${TMP_DIR}/cuda-cccl"
|
||||
|
||||
copy_headers "${TMP_DIR}/cublas-dev"
|
||||
copy_headers "${TMP_DIR}/cudart-dev"
|
||||
copy_headers "${TMP_DIR}/cuda-crt"
|
||||
copy_headers "${TMP_DIR}/cuda-cccl"
|
||||
copy_libs "${TMP_DIR}/cublas-rt"
|
||||
copy_libs "${TMP_DIR}/cudart-rt"
|
||||
|
||||
make_links "libcublas"
|
||||
make_links "libcublasLt"
|
||||
make_links "libcudart"
|
||||
|
||||
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
|
||||
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
|
||||
|
||||
echo "=== cuBLAS extraction complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
|
||||
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||
@@ -7,9 +7,12 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
|
||||
@@ -27,19 +30,54 @@ while [ $# -gt 0 ]; do
|
||||
AUTH_KEYS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--clean-build)
|
||||
CLEAN_CACHE=1
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
VARIANT="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--authorized-keys /path/to/authorized_keys]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||
"${CACHE_DIR:?}/go-mod" \
|
||||
"${CACHE_DIR:?}/tmp" \
|
||||
"${CACHE_DIR:?}/bee" \
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||
echo "container tool not found: $CONTAINER_TOOL" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||
@@ -56,41 +94,101 @@ mkdir -p \
|
||||
|
||||
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||
|
||||
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
image_matches_platform() {
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
|
||||
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! image_matches_platform; then
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||
NEED_BUILD_IMAGE=1
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||
"$CONTAINER_TOOL" build \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||
-t "${IMAGE_REF}" \
|
||||
"${BUILDER_DIR}"
|
||||
else
|
||||
echo "=== using existing builder image ${IMAGE_REF} ==="
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
# Build base docker run args (without --authorized-keys)
|
||||
build_run_args() {
|
||||
_variant="$1"
|
||||
_auth_arg=""
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
echo "run --rm --privileged \
|
||||
--platform ${BUILDER_PLATFORM} \
|
||||
-v ${REPO_ROOT}:/work \
|
||||
-v ${CACHE_DIR}:/cache \
|
||||
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
${IMAGE_REF} \
|
||||
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||
}
|
||||
|
||||
"$CONTAINER_TOOL" "$@"
|
||||
run_variant() {
|
||||
_v="$1"
|
||||
echo "=== building variant: ${_v} ==="
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
--authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
else
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
fi
|
||||
}
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
nogpu)
|
||||
run_variant nogpu
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
esac
|
||||
|
||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
|
||||
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
|
||||
#
|
||||
# Downloads a pinned source snapshot from the official openwall/john repository,
|
||||
# builds it inside the builder container, and caches the resulting run/ tree.
|
||||
|
||||
set -e
|
||||
|
||||
JOHN_COMMIT="$1"
|
||||
DIST_DIR="$2"
|
||||
|
||||
[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
|
||||
echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
|
||||
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
|
||||
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
|
||||
|
||||
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
|
||||
echo "=== john cached, skipping build ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}"
|
||||
if [ ! -f "${SRC_TAR}" ]; then
|
||||
echo "=== downloading john source snapshot ==="
|
||||
wget --show-progress -O "${SRC_TAR}" "${SRC_URL}"
|
||||
fi
|
||||
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "${BUILD_TMP}"' EXIT INT TERM
|
||||
|
||||
cd "${BUILD_TMP}"
|
||||
tar xf "${SRC_TAR}"
|
||||
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
|
||||
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
|
||||
|
||||
cd "${SRC_DIR}/src"
|
||||
echo "=== configuring john ==="
|
||||
./configure
|
||||
echo "=== building john ==="
|
||||
make clean >/dev/null 2>&1 || true
|
||||
make -j"$(nproc)"
|
||||
|
||||
mkdir -p "${CACHE_DIR}"
|
||||
cp -a "../run" "${CACHE_DIR}/run"
|
||||
chmod +x "${CACHE_DIR}/run/john"
|
||||
|
||||
echo "=== john build complete ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
164
iso/builder/build-nccl-tests.sh
Executable file
164
iso/builder/build-nccl-tests.sh
Executable file
@@ -0,0 +1,164 @@
|
||||
#!/bin/sh
|
||||
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
|
||||
#
|
||||
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
|
||||
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
|
||||
#
|
||||
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
|
||||
# are instant unless NCCL_TESTS_VERSION changes.
|
||||
#
|
||||
# Output layout:
|
||||
# $CACHE_DIR/bin/all_reduce_perf
|
||||
# $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests
|
||||
|
||||
set -e
|
||||
|
||||
NCCL_TESTS_VERSION="$1"
|
||||
NCCL_VERSION="$2"
|
||||
NCCL_CUDA_VERSION="$3"
|
||||
DIST_DIR="$4"
|
||||
NVCC_VERSION="${5:-}"
|
||||
DEBIAN_VERSION="${6:-12}"
|
||||
|
||||
[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
|
||||
echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||
|
||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== nccl-tests cached, skipping build ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Resolve nvcc path (cuda-nvcc-X-Y installs to /usr/local/cuda-X.Y/bin/nvcc)
|
||||
NVCC_VERSION_PATH="$(echo "${NVCC_VERSION}" | tr '.' '.')"
|
||||
NVCC=""
|
||||
for candidate in nvcc "/usr/local/cuda-${NVCC_VERSION_PATH}/bin/nvcc" /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do
|
||||
if command -v "$candidate" >/dev/null 2>&1 || [ -x "$candidate" ]; then
|
||||
NVCC="$candidate"
|
||||
break
|
||||
fi
|
||||
done
|
||||
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-$(echo "${NVCC_VERSION}" | tr '.' '-')"; exit 1; }
|
||||
echo "nvcc: $NVCC"
|
||||
|
||||
# Determine CUDA_HOME from nvcc location
|
||||
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
||||
echo "CUDA_HOME: $CUDA_HOME"
|
||||
|
||||
find_cudart_dir() {
|
||||
for dir in \
|
||||
"${CUDA_HOME}/targets/x86_64-linux/lib" \
|
||||
"${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
|
||||
"${CUDA_HOME}/lib64" \
|
||||
"${CUDA_HOME}/lib"; do
|
||||
if [ -d "$dir" ] && find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q .; then
|
||||
printf '%s\n' "$dir"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
|
||||
echo "cudart dir: $CUDART_DIR"
|
||||
|
||||
# Download libnccl-dev for nccl.h
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
||||
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||
DEV_URL="${REPO_BASE}/${DEV_PKG}"
|
||||
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR"
|
||||
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"
|
||||
|
||||
if [ ! -f "$DEV_DEB" ]; then
|
||||
echo "=== downloading libnccl-dev ==="
|
||||
wget --show-progress -O "$DEV_DEB" "$DEV_URL"
|
||||
fi
|
||||
|
||||
# Extract nccl.h from libnccl-dev
|
||||
NCCL_INCLUDE_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "$NCCL_INCLUDE_TMP" "$BUILD_TMP"' EXIT INT TERM
|
||||
|
||||
cd "$NCCL_INCLUDE_TMP"
|
||||
ar x "$DEV_DEB"
|
||||
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
|
||||
tar xf "$DATA_TAR"
|
||||
|
||||
# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
|
||||
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
|
||||
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
|
||||
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
|
||||
echo "nccl.h: $NCCL_H"
|
||||
|
||||
# libnccl.so comes from the already-built NCCL cache (build-nccl.sh ran first)
|
||||
NCCL_LIB_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}/lib"
|
||||
[ -d "$NCCL_LIB_DIR" ] || { echo "ERROR: NCCL lib dir not found at $NCCL_LIB_DIR — run build-nccl.sh first"; exit 1; }
|
||||
echo "nccl lib: $NCCL_LIB_DIR"
|
||||
|
||||
# Download nccl-tests source
|
||||
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
|
||||
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"
|
||||
|
||||
if [ ! -f "$SRC_TAR" ]; then
|
||||
echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
|
||||
wget --show-progress -O "$SRC_TAR" "$SRC_URL"
|
||||
fi
|
||||
|
||||
# Extract and build
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
cd "$BUILD_TMP"
|
||||
tar xf "$SRC_TAR"
|
||||
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
|
||||
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
|
||||
cd "$SRC_DIR"
|
||||
|
||||
echo "=== building all_reduce_perf ==="
|
||||
# Pick gencode based on the actual nvcc version:
|
||||
# CUDA 12.x — Volta..Blackwell (sm_70..sm_100)
|
||||
# CUDA 13.x — Hopper..Blackwell (sm_90..sm_100, Pascal/Volta/Ampere dropped)
|
||||
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null | grep -oE 'release [0-9]+' | awk '{print $2}' | head -1)
|
||||
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
|
||||
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
|
||||
GENCODE="-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100"
|
||||
echo "gencode: sm_90 sm_100 (CUDA 13+)"
|
||||
else
|
||||
GENCODE="-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_86,code=sm_86 \
|
||||
-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100"
|
||||
echo "gencode: sm_70..sm_100 (CUDA 12)"
|
||||
fi
|
||||
LIBRARY_PATH="$NCCL_LIB_DIR${LIBRARY_PATH:+:$LIBRARY_PATH}" \
|
||||
make MPI=0 \
|
||||
NVCC="$NVCC" \
|
||||
CUDA_HOME="$CUDA_HOME" \
|
||||
NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
|
||||
NCCL_LIB="$NCCL_LIB_DIR" \
|
||||
NVCC_GENCODE="$GENCODE" \
|
||||
BUILDDIR="./build"
|
||||
|
||||
[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }
|
||||
|
||||
mkdir -p "${CACHE_DIR}/bin"
|
||||
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
|
||||
mkdir -p "${CACHE_DIR}/lib"
|
||||
find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
|
||||
|
||||
echo "=== nccl-tests build complete ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
|
||||
@@ -10,7 +10,7 @@
|
||||
# Output layout:
|
||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||
|
||||
set -e
|
||||
|
||||
@@ -46,7 +46,11 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
|
||||
CACHE_LAYOUT_VERSION="2"
|
||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
@@ -129,15 +133,30 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
done
|
||||
if [ "$count" -eq 0 ]; then
|
||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||
copied_libs=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||
cp "$f" "$CACHE_DIR/lib/"
|
||||
copied_libs=$((copied_libs+1))
|
||||
done
|
||||
|
||||
if [ "$copied_libs" -eq 0 ]; then
|
||||
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl; do
|
||||
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
@@ -146,16 +165,17 @@ done
|
||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
# Create soname symlinks for every copied versioned library.
|
||||
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||
[ -f "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||
echo "${lib}: .so.1 -> $base"
|
||||
stem=${base%%.so.*}
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||
done
|
||||
|
||||
touch "$CACHE_LAYOUT_MARKER"
|
||||
|
||||
echo "=== NVIDIA build complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $ko_count .ko files"
|
||||
|
||||
1030
iso/builder/build.sh
1030
iso/builder/build.sh
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user