Compare commits
165 Commits
72cf482ad3
...
v3.16
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 | |||
| 4db89e9773 | |||
| 3fda18f708 | |||
| ea518abf30 | |||
| 744de588bb | |||
| a3ed9473a3 | |||
| a714c45f10 | |||
| 349e026cfa | |||
| 889fe1dc2f | |||
| befdbf3768 | |||
| ec6a0b292d | |||
| a03312c286 | |||
| e69e9109da | |||
| 413869809d | |||
| f9bd38572a | |||
| 662e3d2cdd | |||
| 126af96780 | |||
| ada15ac777 | |||
| dfb94f9ca6 | |||
| 5857805518 | |||
| 59a1d4b209 | |||
| 0dbfaf6121 | |||
| 5d72d48714 | |||
| 096b4a09ca | |||
| 5d42a92e4c | |||
| 3e54763367 | |||
| f91bce8661 | |||
| 585e6d7311 | |||
| 0a98ed8ae9 | |||
| 911745e4da | |||
| acfd2010d7 | |||
| e904c13790 | |||
| 24c5c72cee | |||
| 6ff0bcad56 | |||
| 4fef26000c | |||
| a393dcb731 | |||
| 9e55728053 | |||
| 4b8023c1cb | |||
| 4c8417d20a | |||
| 0755374dd2 | |||
| c70ae274fa | |||
| 23ad7ff534 | |||
| de130966f7 | |||
| c6fbfc8306 | |||
| 35ad1c74d9 | |||
| 4a02e74b17 | |||
| cd2853ad99 | |||
| 6caf771d6e | |||
| 14fa87b7d7 | |||
| 600ece911b | |||
| 2d424c63cb | |||
| 50f28d1ee6 | |||
| 3579747ae3 | |||
| 09dc7d2613 | |||
| ec0b7f7ff9 | |||
| e7a7ff54b9 | |||
| b4371e291e | |||
| c22b53a406 | |||
| ff0acc3698 | |||
| d50760e7c6 | |||
| ed4f8be019 | |||
| 883592d029 | |||
| a6dcaf1c7e | |||
| 88727fb590 | |||
| c9f5224c42 | |||
| 7cb5c02a9b | |||
| c1aa3cf491 | |||
| f7eb75c57c | |||
| 004cc4910d | |||
| ed1cceed8c | |||
| 9fe9f061f8 | |||
| 837a1fb981 | |||
| 1f43b4e050 | |||
| 83bbc8a1bc | |||
| 896bdb6ee8 | |||
| 5407c26e25 | |||
| 4fddaba9c5 | |||
| d2f384b6eb | |||
| 25f0f30aaf | |||
| a57b037a91 | |||
| 5644231f9a | |||
| eea98e6d76 | |||
| 967455194c | |||
| 79dabf3efb | |||
| 1336f5b95c | |||
| 31486a31c1 | |||
| aa3fc332ba | |||
| 62c57b87f2 | |||
| f600261546 | |||
| d7ca04bdfb | |||
| 5433652c70 | |||
| b25f014dbd | |||
| d69a46f211 | |||
|
|
fc5c2019aa | ||
|
|
67a215c66f | ||
|
|
8b4bfdf5ad | ||
|
|
0a52a4f3ba | ||
|
|
b132f7973a | ||
|
|
bd94b6c792 | ||
|
|
06017eddfd | ||
|
|
0ac7b6a963 | ||
|
|
3d2ae4cdcb | ||
|
|
4669f14f4f | ||
|
|
540a9e39b8 | ||
|
|
58510207fa | ||
|
|
4cd7c9ab4e | ||
|
|
cfe255f6e4 | ||
|
|
8b9d3447d7 | ||
|
|
614b7cad61 | ||
|
|
9a1df9b1ba | ||
|
|
30cf014d58 | ||
|
|
27d478aed6 | ||
|
|
d36e8442a9 | ||
|
|
b345b0d14d | ||
|
|
0a1ac2ab9f | ||
|
|
1e62f828c6 | ||
|
|
f8c997d272 | ||
|
|
0c16616cc9 | ||
|
|
adcc147b32 | ||
|
|
94e233651e | ||
|
|
03c36f6cb2 | ||
|
|
a221814797 | ||
|
|
b6619d5ccc | ||
|
|
450193b063 | ||
|
|
ee8931f171 | ||
|
|
b771d95894 | ||
|
|
8e60e474dc | ||
|
|
2f4ec2acda | ||
|
|
7ed5cb0306 | ||
|
|
6df7ac68f5 | ||
|
|
0ce23aea4f | ||
|
|
36dff6e584 | ||
|
|
1c80906c1f | ||
|
|
2abe2ce3aa | ||
|
|
8233c9ee85 | ||
|
|
13189e2683 | ||
|
|
76a17937f3 | ||
|
|
b965184e71 | ||
|
|
b25a2f6d30 | ||
|
|
d18cde19c1 | ||
|
|
78c6dfc0ef |
17
PLAN.md
17
PLAN.md
@@ -272,13 +272,10 @@ ISO image bootable via BMC virtual media or USB. Runs boot services automaticall
|
||||
|
||||
### 2.1 — Builder environment
|
||||
|
||||
`iso/builder/setup-builder.sh` prepares a Debian 12 host/VM with:
|
||||
- `live-build`, `debootstrap`, bootloader tooling, kernel headers
|
||||
- Go toolchain
|
||||
- everything needed to compile the `bee` binary and NVIDIA modules
|
||||
|
||||
`iso/builder/build-in-container.sh` offers the same builder stack in a Debian 12 container image.
|
||||
The container run is privileged because `live-build` needs mount/chroot/loop capabilities.
|
||||
`iso/builder/build-in-container.sh` is the only supported builder entrypoint.
|
||||
It builds a Debian 12 builder image with `live-build`, toolchains, and pinned kernel headers,
|
||||
then runs the ISO assembly in a privileged container because `live-build` needs
|
||||
mount/chroot/loop capabilities.
|
||||
|
||||
`iso/builder/build.sh` orchestrates the full ISO build:
|
||||
1. compile the Go `bee` binary
|
||||
@@ -346,9 +343,9 @@ Planned code shape:
|
||||
- `bee tui` can rerun the audit manually
|
||||
- `bee tui` can export the latest audit JSON to removable media
|
||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||
|
||||
### 2.6 — Vendor utilities and optional assets
|
||||
@@ -392,7 +389,7 @@ No "works on my Mac" drift.
|
||||
|
||||
--- BUILDER + BEE ISO (unblock real-hardware testing) ---
|
||||
|
||||
2.1 builder setup → Debian host/VM or privileged container with build deps
|
||||
2.1 builder setup → privileged container with build deps
|
||||
2.2 debug ISO profile → minimal Debian ISO: `bee` binary + OpenSSH + all packages
|
||||
2.3 boot on real server → SSH in, verify packages present, run audit manually
|
||||
|
||||
|
||||
18
audit/Makefile
Normal file
18
audit/Makefile
Normal file
@@ -0,0 +1,18 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||
endif
|
||||
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
go run ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
@@ -1,22 +1,54 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"bee/audit/internal/tui"
|
||||
"bee/audit/internal/webui"
|
||||
)
|
||||
|
||||
var Version = "dev"
|
||||
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
label = "dev"
|
||||
}
|
||||
if info, ok := debug.ReadBuildInfo(); ok {
|
||||
var revision string
|
||||
var modified bool
|
||||
for _, setting := range info.Settings {
|
||||
switch setting.Key {
|
||||
case "vcs.revision":
|
||||
revision = setting.Value
|
||||
case "vcs.modified":
|
||||
modified = setting.Value == "true"
|
||||
}
|
||||
}
|
||||
if revision != "" {
|
||||
short := revision
|
||||
if len(short) > 12 {
|
||||
short = short[:12]
|
||||
}
|
||||
label += " (" + short
|
||||
if modified {
|
||||
label += "+"
|
||||
}
|
||||
label += ")"
|
||||
}
|
||||
}
|
||||
return label
|
||||
}
|
||||
|
||||
func main() {
|
||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||
}
|
||||
@@ -40,10 +72,12 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
case "audit":
|
||||
return runAudit(args[1:], stdout, stderr)
|
||||
case "tui":
|
||||
return runTUI(args[1:], stdout, stderr)
|
||||
case "export":
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
return runPreflight(args[1:], stdout, stderr)
|
||||
case "support-bundle":
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
@@ -61,10 +95,11 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee tui --runtime auto|local|livecd
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee export --target <device>
|
||||
bee web --listen :80 --audit-path /var/log/bee-audit.json
|
||||
bee sat nvidia|memory|storage
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -73,10 +108,12 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
switch args[0] {
|
||||
case "audit":
|
||||
return runAudit([]string{"--help"}, stdout, stdout)
|
||||
case "tui":
|
||||
return runTUI([]string{"--help"}, stdout, stdout)
|
||||
case "export":
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||
case "support-bundle":
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
@@ -135,43 +172,6 @@ func runAudit(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runTUI(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("tui", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee tui [--runtime auto|local|livecd]")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
||||
if err != nil {
|
||||
slog.Error("resolve runtime", "err", err)
|
||||
return 1
|
||||
}
|
||||
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(io.Discard, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
|
||||
application := app.New(platform.New())
|
||||
if err := tui.Run(application, runtimeInfo.Mode); err != nil {
|
||||
slog.Error("run tui", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runExport(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -219,14 +219,96 @@ func runExport(args []string, stdout, stderr io.Writer) int {
|
||||
return 1
|
||||
}
|
||||
|
||||
func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("preflight", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
application := app.New(platform.New())
|
||||
path, err := application.RunRuntimePreflight(*output)
|
||||
if err != nil {
|
||||
slog.Error("run preflight", "err", err)
|
||||
return 1
|
||||
}
|
||||
if path != "stdout" {
|
||||
slog.Info("runtime health written", "path", path)
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
path, err := app.BuildSupportBundle(app.DefaultExportDir)
|
||||
if err != nil {
|
||||
slog.Error("build support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
defer os.Remove(path)
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
slog.Error("read support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
switch {
|
||||
case *output == "stdout":
|
||||
if _, err := stdout.Write(raw); err != nil {
|
||||
slog.Error("write support bundle stdout", "err", err)
|
||||
return 1
|
||||
}
|
||||
case strings.HasPrefix(*output, "file:"):
|
||||
dst := strings.TrimPrefix(*output, "file:")
|
||||
if err := os.WriteFile(dst, raw, 0644); err != nil {
|
||||
slog.Error("write support bundle", "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("support bundle written", "path", dst)
|
||||
default:
|
||||
fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee web [--listen :80] [--audit-path /var/log/bee-audit.json] [--title \"Bee Hardware Audit\"]")
|
||||
fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
@@ -241,9 +323,19 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
}
|
||||
|
||||
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for web", "err", err)
|
||||
}
|
||||
|
||||
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
||||
Title: *title,
|
||||
AuditPath: *auditPath,
|
||||
Title: *title,
|
||||
BuildLabel: buildLabel(),
|
||||
AuditPath: *auditPath,
|
||||
ExportDir: *exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}); err != nil {
|
||||
slog.Error("run web", "err", err)
|
||||
return 1
|
||||
@@ -253,43 +345,65 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
return 2
|
||||
}
|
||||
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage")
|
||||
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
return 0
|
||||
}
|
||||
if args[0] != "nvidia" && args[0] != "memory" && args[0] != "storage" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", args[0])
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
|
||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if len(args) > 1 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage")
|
||||
if fs.NArg() != 0 {
|
||||
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
|
||||
return 2
|
||||
}
|
||||
|
||||
target := args[0]
|
||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
label string
|
||||
)
|
||||
switch args[0] {
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
switch target {
|
||||
case "nvidia":
|
||||
label = "nvidia"
|
||||
archive, err = application.RunNvidiaAcceptancePack("")
|
||||
level := *diagLevel
|
||||
if level > 0 {
|
||||
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||
} else {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
label = "memory"
|
||||
archive, err = application.RunMemoryAcceptancePack("")
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
case "storage":
|
||||
label = "storage"
|
||||
archive, err = application.RunStorageAcceptancePack("")
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
||||
}
|
||||
if err != nil {
|
||||
slog.Error("run sat", "target", label, "err", err)
|
||||
slog.Error("run sat", "target", target, "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("sat archive written", "target", label, "path", archive)
|
||||
slog.Info("sat archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
@@ -91,6 +91,32 @@ func TestRunSATUsage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunPreflightRejectsExtraArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
rc := run([]string{"preflight", "extra"}, &stdout, &stderr)
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee preflight") {
|
||||
t.Fatalf("stderr missing preflight usage:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunSupportBundleRejectsExtraArgs(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
rc := run([]string{"support-bundle", "extra"}, &stdout, &stderr)
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee support-bundle") {
|
||||
t.Fatalf("stderr missing support-bundle usage:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunHelpForSubcommand(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -138,7 +164,7 @@ func TestRunSATHelp(t *testing.T) {
|
||||
if rc != 0 {
|
||||
t.Fatalf("rc=%d want 0", rc)
|
||||
}
|
||||
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage") {
|
||||
if !strings.Contains(stdout.String(), "usage: bee sat nvidia|memory|storage|cpu") {
|
||||
t.Fatalf("stdout missing sat help:\n%s", stdout.String())
|
||||
}
|
||||
}
|
||||
@@ -151,8 +177,8 @@ func TestRunSATRejectsExtraArgs(t *testing.T) {
|
||||
if rc != 2 {
|
||||
t.Fatalf("rc=%d want 2", rc)
|
||||
}
|
||||
if !strings.Contains(stderr.String(), "usage: bee sat nvidia|memory|storage") {
|
||||
t.Fatalf("stderr missing sat usage:\n%s", stderr.String())
|
||||
if !strings.Contains(stderr.String(), "bee sat: unexpected arguments") {
|
||||
t.Fatalf("stderr missing sat error:\n%s", stderr.String())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
35
audit/go.mod
35
audit/go.mod
@@ -1,27 +1,26 @@
|
||||
module bee/audit
|
||||
|
||||
go 1.24.0
|
||||
go 1.25.0
|
||||
|
||||
replace reanimator/chart => ../internal/chart
|
||||
|
||||
require github.com/charmbracelet/bubbletea v1.3.4
|
||||
require reanimator/chart v0.0.0
|
||||
require (
|
||||
github.com/go-analyze/charts v0.5.26
|
||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/charmbracelet/lipgloss v1.0.0 // indirect
|
||||
github.com/charmbracelet/x/ansi v0.8.0 // indirect
|
||||
github.com/charmbracelet/x/term v0.2.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.16 // indirect
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.15.2 // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
golang.org/x/sync v0.11.0 // indirect
|
||||
golang.org/x/sys v0.30.0 // indirect
|
||||
golang.org/x/text v0.3.8 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
golang.org/x/image v0.24.0 // indirect
|
||||
golang.org/x/sys v0.42.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
modernc.org/sqlite v1.48.0 // indirect
|
||||
)
|
||||
|
||||
68
audit/go.sum
68
audit/go.sum
@@ -1,37 +1,37 @@
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
||||
github.com/charmbracelet/bubbletea v1.3.4 h1:kCg7B+jSCFPLYRA52SDZjr51kG/fMUEoPoZrkaDHyoI=
|
||||
github.com/charmbracelet/bubbletea v1.3.4/go.mod h1:dtcUCyCGEX3g9tosuYiut3MXgY/Jsv9nKVdibKKRRXo=
|
||||
github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
|
||||
github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
|
||||
github.com/charmbracelet/x/ansi v0.8.0 h1:9GTq3xq9caJW8ZrBTe0LIe2fvfLR/bYXKTx2llXn7xE=
|
||||
github.com/charmbracelet/x/ansi v0.8.0/go.mod h1:wdYl/ONOLHLIVmQaxbIYEC/cRKOQyjTkowiI4blgS9Q=
|
||||
github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ=
|
||||
github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
|
||||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
|
||||
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
|
||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
|
||||
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w=
|
||||
golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
|
||||
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
|
||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
@@ -17,17 +19,27 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
DefaultAuditJSONPath = "/var/log/bee-audit.json"
|
||||
DefaultAuditLogPath = "/var/log/bee-audit.log"
|
||||
DefaultSATBaseDir = "/var/log/bee-sat"
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
network networkManager
|
||||
services serviceManager
|
||||
exports exportManager
|
||||
tools toolManager
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -41,10 +53,15 @@ type networkManager interface {
|
||||
DHCPOne(iface string) (string, error)
|
||||
DHCPAll() (string, error)
|
||||
SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error)
|
||||
SetInterfaceState(iface string, up bool) error
|
||||
GetInterfaceState(iface string) (bool, error)
|
||||
CaptureNetworkSnapshot() (platform.NetworkSnapshot, error)
|
||||
RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error
|
||||
}
|
||||
|
||||
type serviceManager interface {
|
||||
ListBeeServices() ([]string, error)
|
||||
ServiceState(name string) string
|
||||
ServiceStatus(name string) (string, error)
|
||||
ServiceDo(name string, action platform.ServiceAction) (string, error)
|
||||
}
|
||||
@@ -59,24 +76,94 @@ type toolManager interface {
|
||||
CheckTools(names []string) []platform.ToolStatus
|
||||
}
|
||||
|
||||
type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
type GPUPresenceResult struct {
|
||||
Nvidia bool
|
||||
AMD bool
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUPresence() GPUPresenceResult {
|
||||
vendor := a.sat.DetectGPUVendor()
|
||||
return GPUPresenceResult{
|
||||
Nvidia: vendor == "nvidia",
|
||||
AMD: vendor == "amd",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string) (string, error)
|
||||
RunMemoryAcceptancePack(baseDir string) (string, error)
|
||||
RunStorageAcceptancePack(baseDir string) (string, error)
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error)
|
||||
CaptureTechnicalDump(baseDir string) error
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
tools: platform,
|
||||
sat: platform,
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
slog.Warn("capture technical dump", "err", err)
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
data, err := json.MarshalIndent(result, "", " ")
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -88,6 +175,9 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -97,6 +187,72 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) RunRuntimePreflight(output string) (string, error) {
|
||||
health, err := a.runtime.CollectRuntimeHealth(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
data, err := json.MarshalIndent(health, "", " ")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
switch {
|
||||
case output == "stdout":
|
||||
_, err := os.Stdout.Write(append(data, '\n'))
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
default:
|
||||
return "", fmt.Errorf("unknown output destination %q — use stdout or file:<path>", output)
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) RunRuntimePreflightResult() (ActionResult, error) {
|
||||
path, err := a.RunRuntimePreflight("file:" + DefaultRuntimeJSONPath)
|
||||
body := "Runtime preflight completed."
|
||||
if path != "" {
|
||||
body = "Runtime health written to " + path
|
||||
}
|
||||
return ActionResult{Title: "Run self-check", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RuntimeHealthResult() ActionResult {
|
||||
health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath)
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Runtime issues", Body: "No runtime health found."}
|
||||
}
|
||||
driverLabel := "Driver ready"
|
||||
accelLabel := "CUDA ready"
|
||||
switch a.sat.DetectGPUVendor() {
|
||||
case "amd":
|
||||
driverLabel = "AMDGPU ready"
|
||||
accelLabel = "ROCm SMI ready"
|
||||
case "nvidia":
|
||||
driverLabel = "NVIDIA ready"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "Status: %s\n", firstNonEmpty(health.Status, "UNKNOWN"))
|
||||
fmt.Fprintf(&body, "Export dir: %s\n", firstNonEmpty(health.ExportDir, DefaultExportDir))
|
||||
fmt.Fprintf(&body, "%s: %t\n", driverLabel, health.DriverReady)
|
||||
fmt.Fprintf(&body, "%s: %t\n", accelLabel, health.CUDAReady)
|
||||
fmt.Fprintf(&body, "Network: %s", firstNonEmpty(health.NetworkStatus, "UNKNOWN"))
|
||||
if len(health.Issues) > 0 {
|
||||
body.WriteString("\n\nIssues:\n")
|
||||
for _, issue := range health.Issues {
|
||||
fmt.Fprintf(&body, "- %s: %s\n", issue.Code, issue.Description)
|
||||
}
|
||||
}
|
||||
return ActionResult{Title: "Runtime issues", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) RunAuditNow(runtimeMode runtimeenv.Mode) (ActionResult, error) {
|
||||
path, err := a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath)
|
||||
body := "Audit completed."
|
||||
@@ -129,13 +285,37 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit exported."
|
||||
if path != "" {
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
@@ -166,6 +346,22 @@ func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
@@ -222,6 +418,10 @@ func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
@@ -277,12 +477,15 @@ func (a *App) AuditLogTailResult() ActionResult {
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunNvidiaAcceptancePack(baseDir)
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
@@ -290,30 +493,235 @@ func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunMemoryAcceptancePack(baseDir)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "Memory SAT", Body: body}, err
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
return a.sat.RunStorageAcceptancePack(baseDir)
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.sat.RunStorageAcceptancePack(baseDir)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return ActionResult{Title: "Storage SAT", Body: body}, err
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
|
||||
func (a *App) HealthSummaryResult() ActionResult {
|
||||
@@ -435,6 +843,18 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return schema.RuntimeHealth{}, err
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
return schema.RuntimeHealth{}, err
|
||||
}
|
||||
return health, nil
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
@@ -443,6 +863,7 @@ func latestSATSummaries() []string {
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
@@ -647,12 +1068,17 @@ func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
model := strings.ToLower(trimPtr(dev.Model))
|
||||
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
||||
// Exclude ASPEED (BMC VGA adapter, not a compute GPU)
|
||||
if strings.Contains(vendor, "aspeed") || strings.Contains(model, "aspeed") {
|
||||
return false
|
||||
}
|
||||
// AMD Instinct / Radeon compute GPUs have class ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match by AMD vendor alone — chipset/CPU PCIe devices share that vendor.
|
||||
return class == "VideoController" ||
|
||||
class == "DisplayController" ||
|
||||
class == "ProcessingAccelerator" ||
|
||||
strings.Contains(model, "nvidia") ||
|
||||
strings.Contains(vendor, "nvidia") ||
|
||||
strings.Contains(vendor, "amd")
|
||||
strings.Contains(vendor, "nvidia")
|
||||
}
|
||||
|
||||
func trimPtr(value *string) string {
|
||||
@@ -725,3 +1151,70 @@ func firstNonEmpty(values ...string) string {
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func cleanSummaryKey(key string) string {
|
||||
idx := strings.Index(key, "-")
|
||||
if idx <= 0 {
|
||||
return key
|
||||
}
|
||||
prefix := key[:idx]
|
||||
for _, c := range prefix {
|
||||
if c < '0' || c > '9' {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return key[idx+1:]
|
||||
}
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
@@ -39,6 +43,13 @@ func (f fakeNetwork) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error
|
||||
return f.setStaticIPv4Fn(cfg)
|
||||
}
|
||||
|
||||
func (f fakeNetwork) SetInterfaceState(_ string, _ bool) error { return nil }
|
||||
func (f fakeNetwork) GetInterfaceState(_ string) (bool, error) { return true, nil }
|
||||
func (f fakeNetwork) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return platform.NetworkSnapshot{}, nil
|
||||
}
|
||||
func (f fakeNetwork) RestoreNetworkSnapshot(platform.NetworkSnapshot) error { return nil }
|
||||
|
||||
type fakeServices struct {
|
||||
serviceStatusFn func(string) (string, error)
|
||||
serviceDoFn func(string, platform.ServiceAction) (string, error)
|
||||
@@ -48,6 +59,10 @@ func (f fakeServices) ListBeeServices() ([]string, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceState(name string) string {
|
||||
return "active"
|
||||
}
|
||||
|
||||
func (f fakeServices) ServiceStatus(name string) (string, error) {
|
||||
return f.serviceStatusFn(name)
|
||||
}
|
||||
@@ -56,16 +71,41 @@ func (f fakeServices) ServiceDo(name string, action platform.ServiceAction) (str
|
||||
return f.serviceDoFn(name, action)
|
||||
}
|
||||
|
||||
type fakeExports struct{}
|
||||
type fakeExports struct {
|
||||
listTargetsFn func() ([]platform.RemovableTarget, error)
|
||||
exportToTargetFn func(string, platform.RemovableTarget) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeExports) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
if f.listTargetsFn != nil {
|
||||
return f.listTargetsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeExports) ExportFileToTarget(src string, target platform.RemovableTarget) (string, error) {
|
||||
if f.exportToTargetFn != nil {
|
||||
return f.exportToTargetFn(src, target)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
type fakeRuntime struct {
|
||||
collectFn func(string) (schema.RuntimeHealth, error)
|
||||
dumpFn func(string) error
|
||||
}
|
||||
|
||||
func (f fakeRuntime) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return f.collectFn(exportDir)
|
||||
}
|
||||
|
||||
func (f fakeRuntime) CaptureTechnicalDump(baseDir string) error {
|
||||
if f.dumpFn != nil {
|
||||
return f.dumpFn(baseDir)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeTools struct {
|
||||
tailFileFn func(string, int) string
|
||||
checkToolsFn func([]string) []platform.ToolStatus
|
||||
@@ -80,23 +120,105 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir string, _ int, _ []int, _ func(string)) (string, error) {
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||
if f.runCPUFn != nil {
|
||||
return f.runCPUFn(baseDir, durationSec)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) DetectGPUVendor() string {
|
||||
if f.detectVendorFn != nil {
|
||||
return f.detectVendorFn()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
if f.listAMDGPUsFn != nil {
|
||||
return f.listAMDGPUsFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
if f.runAMDPackFn != nil {
|
||||
return f.runAMDPackFn(baseDir)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunMemoryStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunSATStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStressOptions) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -110,6 +232,9 @@ func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
},
|
||||
defaultRouteFn: func() string { return "10.0.0.1" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -138,6 +263,9 @@ func TestNetworkStatusHandlesNoInterfaces(t *testing.T) {
|
||||
listInterfacesFn: func() ([]platform.InterfaceInfo, error) { return nil, nil },
|
||||
defaultRouteFn: func() string { return "" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -159,6 +287,9 @@ func TestNetworkStatusPropagatesListError(t *testing.T) {
|
||||
},
|
||||
defaultRouteFn: func() string { return "" },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.NetworkStatus()
|
||||
@@ -183,6 +314,9 @@ func TestParseStaticIPv4ConfigAndDefaults(t *testing.T) {
|
||||
dhcpAllFn: func() (string, error) { return "", nil },
|
||||
setStaticIPv4Fn: func(platform.StaticIPv4Config) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
defaults := a.DefaultStaticIPv4FormFields("eth0")
|
||||
@@ -219,6 +353,9 @@ func TestServiceActionResults(t *testing.T) {
|
||||
return string(action) + " ok", nil
|
||||
},
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
statusResult, err := a.ServiceStatusResult("bee-audit")
|
||||
@@ -301,6 +438,11 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
runMemoryFn: func(string) (string, error) { return "", nil },
|
||||
runStorageFn: func(string) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{Status: "PARTIAL", ExportDir: "/tmp/export"}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if got, _ := a.DHCPOneResult("eth0"); got.Body != "DHCP completed." {
|
||||
@@ -327,14 +469,87 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
if got, _ := a.RunNvidiaAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
t.Fatalf("sat body=%q", got.Body)
|
||||
}
|
||||
if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
if got, _ := a.RunMemoryAcceptancePackResult(""); got.Body != "No output produced." {
|
||||
t.Fatalf("memory sat body=%q", got.Body)
|
||||
}
|
||||
if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "Archive written." {
|
||||
if got, _ := a.RunStorageAcceptancePackResult(""); got.Body != "No output produced." {
|
||||
t.Fatalf("storage sat body=%q", got.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(src string, target platform.RemovableTarget) (string, error) {
|
||||
if filepath.Base(src) == "" {
|
||||
t.Fatalf("expected non-empty source path")
|
||||
}
|
||||
return "/media/bee/" + filepath.Base(src), nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sdb1"})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportSupportBundleResult error: %v", err)
|
||||
}
|
||||
if result.Title != "Export support bundle" {
|
||||
t.Fatalf("title=%q want %q", result.Title, "Export support bundle")
|
||||
}
|
||||
if want := "USB target unmounted and safe to remove."; !contains(result.Body, want) {
|
||||
t.Fatalf("body missing %q\nbody=%s", want, result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
t.Cleanup(func() { DefaultExportDir = oldExportDir })
|
||||
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.json"), []byte("{}\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.json: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(tmp, "bee-audit.log"), []byte("audit ok\n"), 0644); err != nil {
|
||||
t.Fatalf("write bee-audit.log: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
exports: fakeExports{
|
||||
exportToTargetFn: func(string, platform.RemovableTarget) (string, error) {
|
||||
return "", errors.New("mount /dev/sda1: exFAT support is missing in this ISO build")
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.ExportSupportBundleResult(platform.RemovableTarget{Device: "/dev/sda1", FSType: "exfat"})
|
||||
if err == nil {
|
||||
t.Fatal("expected export error")
|
||||
}
|
||||
if contains(result.Body, "exported to") {
|
||||
t.Fatalf("body should not claim success:\n%s", result.Body)
|
||||
}
|
||||
if result.Body != "Support bundle export failed." {
|
||||
t.Fatalf("body=%q want %q", result.Body, "Support bundle export failed.")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -349,6 +564,9 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
runMemoryFn: func(string) (string, error) { return "", nil },
|
||||
runStorageFn: func(string) (string, error) { return "", nil },
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
result, err := a.RunNvidiaAcceptancePackResult("/tmp/sat")
|
||||
@@ -360,6 +578,50 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
runNvidiaFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("nvidia baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
runMemoryFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("memory baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
runStorageFn: func(baseDir string) (string, error) {
|
||||
if baseDir != "/tmp/export/bee-sat" {
|
||||
t.Fatalf("storage baseDir=%q", baseDir)
|
||||
}
|
||||
return "", nil
|
||||
},
|
||||
},
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(string) (schema.RuntimeHealth, error) { return schema.RuntimeHealth{}, nil },
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := a.RunNvidiaAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunMemoryAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if _, err := a.RunStorageAcceptancePack("", nil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatSATSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -398,6 +660,69 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
archive, err := BuildSupportBundle(exportDir)
|
||||
if err != nil {
|
||||
t.Fatalf("BuildSupportBundle error: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(archive); err != nil {
|
||||
t.Fatalf("archive stat: %v", err)
|
||||
}
|
||||
|
||||
file, err := os.Open(archive)
|
||||
if err != nil {
|
||||
t.Fatalf("open archive: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gzr, err := gzip.NewReader(file)
|
||||
if err != nil {
|
||||
t.Fatalf("gzip reader: %v", err)
|
||||
}
|
||||
defer gzr.Close()
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
for _, name := range names {
|
||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||
foundRaw = true
|
||||
}
|
||||
if contains(name, "/export/bee-sat/memory-run.tar.gz") {
|
||||
t.Fatalf("support bundle should not contain nested SAT archive: %s", name)
|
||||
}
|
||||
}
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
@@ -472,6 +797,44 @@ func TestMainBanner(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeHealthResultUsesAMDLabels(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldRuntimePath := DefaultRuntimeJSONPath
|
||||
DefaultRuntimeJSONPath = filepath.Join(tmp, "runtime-health.json")
|
||||
t.Cleanup(func() { DefaultRuntimeJSONPath = oldRuntimePath })
|
||||
|
||||
raw, err := json.Marshal(schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: "/appdata/bee/export",
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
NetworkStatus: "OK",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal runtime health: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(DefaultRuntimeJSONPath, raw, 0644); err != nil {
|
||||
t.Fatalf("write runtime health: %v", err)
|
||||
}
|
||||
|
||||
a := &App{
|
||||
sat: fakeSAT{
|
||||
detectVendorFn: func() string { return "amd" },
|
||||
},
|
||||
}
|
||||
|
||||
result := a.RuntimeHealthResult()
|
||||
if !contains(result.Body, "AMDGPU ready: true") {
|
||||
t.Fatalf("body missing AMD driver label:\n%s", result.Body)
|
||||
}
|
||||
if !contains(result.Body, "ROCm SMI ready: true") {
|
||||
t.Fatalf("body missing ROCm label:\n%s", result.Body)
|
||||
}
|
||||
if contains(result.Body, "CUDA ready") {
|
||||
t.Fatalf("body should not mention CUDA on AMD:\n%s", result.Body)
|
||||
}
|
||||
}
|
||||
|
||||
func intPtr(v int) *int { return &v }
|
||||
|
||||
func contains(haystack, needle string) bool {
|
||||
|
||||
218
audit/internal/app/sat_overlay.go
Normal file
218
audit/internal/app/sat_overlay.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
|
||||
applyCPUSAT(snap.CPUs, summary)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
runAtUTC string
|
||||
overall string
|
||||
kv map[string]string
|
||||
}
|
||||
|
||||
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return satSummary{}, false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
return satSummary{}, false
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
return satSummary{
|
||||
runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
|
||||
overall: strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
|
||||
kv: kv,
|
||||
}, true
|
||||
}
|
||||
|
||||
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if !matchesGPUVendor(devs[i], vendor) {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "memory SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range dimms {
|
||||
mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
|
||||
status, description, ok := satSummaryStatus(summary, "CPU SAT")
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range cpus {
|
||||
mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
|
||||
byDevice := parseStorageSATStatus(summary)
|
||||
for i := range disks {
|
||||
devPath, _ := disks[i].Telemetry["linux_device"].(string)
|
||||
devName := filepath.Base(strings.TrimSpace(devPath))
|
||||
if devName == "" {
|
||||
continue
|
||||
}
|
||||
result, ok := byDevice[devName]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
|
||||
}
|
||||
}
|
||||
|
||||
type satStatusResult struct {
|
||||
status string
|
||||
description string
|
||||
ok bool
|
||||
}
|
||||
|
||||
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
|
||||
result := map[string]satStatusResult{}
|
||||
for key, value := range summary.kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
step := strings.ReplaceAll(base[idx+1:], "_", "-")
|
||||
stepStatus, desc, ok := satKeyStatus(strings.ToUpper(strings.TrimSpace(value)), "storage "+step)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
current := result[devName]
|
||||
if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
|
||||
result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
|
||||
return satKeyStatus(summary.overall, label)
|
||||
}
|
||||
|
||||
func satKeyStatus(rawStatus, label string) (string, string, bool) {
|
||||
switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
|
||||
case "OK":
|
||||
// No error description on success — error_description is for problems only.
|
||||
return "OK", "", true
|
||||
case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
|
||||
// Tool couldn't run or test was incomplete — we can't assert hardware health.
|
||||
return "Unknown", "", true
|
||||
case "FAILED":
|
||||
return "Critical", label + " failed", true
|
||||
default:
|
||||
return "", "", false
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
if current == "" || current == "Unknown" || statusSeverity(satStatus) > statusSeverity(current) {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
if strings.TrimSpace(description) != "" {
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
}
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
case "Unknown":
|
||||
return 1 // same as OK — does not override OK from another source
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
}
|
||||
return *v
|
||||
}
|
||||
|
||||
func appStringPtr(value string) *string {
|
||||
return &value
|
||||
}
|
||||
61
audit/internal/app/sat_overlay_test.go
Normal file
61
audit/internal/app/sat_overlay_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "storage-20260325-161151")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
}
|
||||
if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
|
||||
t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
364
audit/internal/app/support_bundle.go
Normal file
364
audit/internal/app/support_bundle.go
Normal file
@@ -0,0 +1,364 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
"bee-nvidia.service",
|
||||
"bee-preflight.service",
|
||||
"bee-sshsetup.service",
|
||||
}
|
||||
|
||||
var supportBundleCommands = []struct {
|
||||
name string
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
}
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
exportDir = DefaultExportDir
|
||||
}
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := cleanupOldSupportBundles(os.TempDir()); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
host := sanitizeFilename(hostnameOr("unknown"))
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.RemoveAll(stageRoot)
|
||||
|
||||
if err := copyExportDirForSupportBundle(exportDir, filepath.Join(stageRoot, "export")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeJournalDump(filepath.Join(stageRoot, "systemd", "combined.journal.log")); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, svc := range supportBundleServices {
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".status.txt"), []string{"systemctl", "status", svc, "--no-pager"}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, "systemd", svc+".journal.log"), []string{"journalctl", "--no-pager", "-u", svc}); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleCommands {
|
||||
if err := writeCommandOutput(filepath.Join(stageRoot, item.name), item.cmd); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
type entry struct {
|
||||
path string
|
||||
mod time.Time
|
||||
}
|
||||
list := make([]entry, 0, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||
_ = os.Remove(match)
|
||||
continue
|
||||
}
|
||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||
}
|
||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||
if len(list) > 3 {
|
||||
for _, old := range list[3:] {
|
||||
_ = os.Remove(old.path)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
args := []string{"--no-pager"}
|
||||
for _, svc := range supportBundleServices {
|
||||
args = append(args, "-u", svc)
|
||||
}
|
||||
raw, err := exec.Command("journalctl", args...).CombinedOutput()
|
||||
if len(raw) == 0 && err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
}
|
||||
if len(raw) == 0 {
|
||||
raw = []byte("no journal output\n")
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeCommandOutput(dst string, cmd []string) error {
|
||||
if len(cmd) == 0 {
|
||||
return nil
|
||||
}
|
||||
raw, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if len(raw) == 0 {
|
||||
if err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
} else {
|
||||
raw = []byte("no output\n")
|
||||
}
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "bee_version=%s\n", buildVersion())
|
||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||
fmt.Fprintf(&body, "\nfiles:\n")
|
||||
|
||||
var files []string
|
||||
if err := filepath.Walk(stageRoot, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil || info.IsDir() {
|
||||
return err
|
||||
}
|
||||
if filepath.Clean(path) == filepath.Clean(dst) {
|
||||
return nil
|
||||
}
|
||||
rel, err := filepath.Rel(stageRoot, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
files = append(files, fmt.Sprintf("%s\t%d", rel, info.Size()))
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
sort.Strings(files)
|
||||
for _, line := range files {
|
||||
body.WriteString(line)
|
||||
body.WriteByte('\n')
|
||||
}
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
return strings.TrimSpace(string(raw))
|
||||
}
|
||||
|
||||
func copyDirContents(srcDir, dstDir string) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPath(src, dst); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(cleanRel, "bee-sat/") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
if strings.HasPrefix(filepath.Base(cleanRel), "bee-support-") && strings.HasSuffix(cleanRel, ".tar.gz") {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
entries, err := os.ReadDir(srcDir)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
src := filepath.Join(srcDir, entry.Name())
|
||||
dst := filepath.Join(dstDir, entry.Name())
|
||||
if err := copyPathFiltered(srcDir, src, dst, keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyPath(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPath(filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name())); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
|
||||
out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, info.Mode().Perm())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func copyPathFiltered(rootSrc, src, dst string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(rootSrc, src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if keep != nil && !keep(rel, info) {
|
||||
return nil
|
||||
}
|
||||
if info.IsDir() {
|
||||
if err := os.MkdirAll(dst, info.Mode().Perm()); err != nil {
|
||||
return err
|
||||
}
|
||||
entries, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, entry := range entries {
|
||||
if err := copyPathFiltered(rootSrc, filepath.Join(src, entry.Name()), filepath.Join(dst, entry.Name()), keep); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return copyPath(src, dst)
|
||||
}
|
||||
|
||||
func createSupportTarGz(dst, srcDir string) error {
|
||||
file, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
gz := gzip.NewWriter(file)
|
||||
defer gz.Close()
|
||||
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
base := filepath.Dir(srcDir)
|
||||
return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
header, err := tar.FileInfoHeader(info, "")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
header.Name, err = filepath.Rel(base, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := tw.WriteHeader(header); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
_, err = io.Copy(tw, f)
|
||||
return err
|
||||
})
|
||||
}
|
||||
252
audit/internal/collector/amdgpu.go
Normal file
252
audit/internal/collector/amdgpu.go
Normal file
@@ -0,0 +1,252 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var (
|
||||
amdSMIExecCommand = exec.Command
|
||||
amdSMILookPath = exec.LookPath
|
||||
amdSMIGlob = filepath.Glob
|
||||
)
|
||||
|
||||
var amdSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
"/usr/local/bin/rocm-smi",
|
||||
}
|
||||
|
||||
type amdGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
Product string
|
||||
Firmware string
|
||||
PowerW *float64
|
||||
TempC *float64
|
||||
}
|
||||
|
||||
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasAMDGPUDevices(devs) {
|
||||
return devs
|
||||
}
|
||||
infoByBDF, err := queryAMDGPUs()
|
||||
if err != nil {
|
||||
slog.Info("amdgpu: enrichment skipped", "err", err)
|
||||
return devs
|
||||
}
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(info.Serial) != "" {
|
||||
devs[i].SerialNumber = &info.Serial
|
||||
}
|
||||
if strings.TrimSpace(info.Firmware) != "" {
|
||||
devs[i].Firmware = &info.Firmware
|
||||
}
|
||||
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
||||
devs[i].Model = &info.Product
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
devs[i].PowerW = info.PowerW
|
||||
}
|
||||
if info.TempC != nil {
|
||||
devs[i].TemperatureC = info.TempC
|
||||
}
|
||||
enriched++
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("amdgpu: enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
for _, dev := range devs {
|
||||
if isAMDGPUDevice(dev) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
busByCard, err := queryAMDField("--showbus")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
infoByCard := map[string]amdGPUInfo{}
|
||||
for card, bus := range busByCard {
|
||||
bdf := normalizePCIeBDF(bus)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
||||
}
|
||||
if len(infoByCard) == 0 {
|
||||
return map[string]amdGPUInfo{}, nil
|
||||
}
|
||||
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
||||
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
||||
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
||||
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
||||
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
||||
|
||||
result := make(map[string]amdGPUInfo, len(infoByCard))
|
||||
for _, info := range infoByCard {
|
||||
if info.BDF == "" {
|
||||
continue
|
||||
}
|
||||
result[info.BDF] = info
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
||||
values, err := queryAMDNumericField(flag)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
for card, value := range values {
|
||||
info, ok := infoByCard[card]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
apply(&info, value)
|
||||
infoByCard[card] = info
|
||||
}
|
||||
}
|
||||
|
||||
func queryAMDField(flag string) (map[string]string, error) {
|
||||
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseROCmSingleValueCSV(string(out)), nil
|
||||
}
|
||||
|
||||
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
||||
values, err := queryAMDField(flag)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]float64{}
|
||||
for card, raw := range values {
|
||||
if value, ok := firstFloat(raw); ok {
|
||||
out[card] = value
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
||||
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, pattern := range amdSMIExecutableGlobs {
|
||||
matches, err := amdSMIGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
return append([]string{match}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, exec.ErrNotFound
|
||||
}
|
||||
|
||||
func parseROCmSingleValueCSV(raw string) map[string]string {
|
||||
rows := map[string]string{}
|
||||
reader := csv.NewReader(strings.NewReader(raw))
|
||||
reader.FieldsPerRecord = -1
|
||||
records, err := reader.ReadAll()
|
||||
if err != nil {
|
||||
return rows
|
||||
}
|
||||
for _, rec := range records {
|
||||
if len(rec) < 2 {
|
||||
continue
|
||||
}
|
||||
card := normalizeROCmCardKey(rec[0])
|
||||
if card == "" {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
||||
if value == "" || looksLikeCSVHeaderValue(value) {
|
||||
continue
|
||||
}
|
||||
rows[card] = value
|
||||
}
|
||||
return rows
|
||||
}
|
||||
|
||||
func normalizeROCmCardKey(raw string) string {
|
||||
raw = strings.ToLower(strings.TrimSpace(raw))
|
||||
raw = strings.Trim(raw, "\"")
|
||||
if raw == "" {
|
||||
return ""
|
||||
}
|
||||
if raw == "device" || raw == "gpu" || raw == "card" {
|
||||
return ""
|
||||
}
|
||||
if strings.HasPrefix(raw, "card") {
|
||||
return raw
|
||||
}
|
||||
if _, err := strconv.Atoi(raw); err == nil {
|
||||
return "card" + raw
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeCSVHeaderValue(value string) bool {
|
||||
value = strings.ToLower(strings.TrimSpace(value))
|
||||
return strings.Contains(value, "product") ||
|
||||
strings.Contains(value, "serial") ||
|
||||
strings.Contains(value, "vbios") ||
|
||||
strings.Contains(value, "bus")
|
||||
}
|
||||
56
audit/internal/collector/amdgpu_test.go
Normal file
56
audit/internal/collector/amdgpu_test.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseROCmSingleValueCSV(t *testing.T) {
|
||||
raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
|
||||
got := parseROCmSingleValueCSV(raw)
|
||||
if got["card0"] != "ABC123" {
|
||||
t.Fatalf("card0=%q want ABC123", got["card0"])
|
||||
}
|
||||
if got["card1"] != "XYZ789" {
|
||||
t.Fatalf("card1=%q want XYZ789", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
|
||||
origExec := amdSMIExecCommand
|
||||
origLookPath := amdSMILookPath
|
||||
t.Cleanup(func() {
|
||||
amdSMIExecCommand = origExec
|
||||
amdSMILookPath = origLookPath
|
||||
})
|
||||
|
||||
amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
|
||||
amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
|
||||
}
|
||||
|
||||
got, err := queryAMDNumericField("--showtemp")
|
||||
if err != nil {
|
||||
t.Fatalf("queryAMDNumericField: %v", err)
|
||||
}
|
||||
if got["card0"] != 45.5 {
|
||||
t.Fatalf("card0=%v want 45.5", got["card0"])
|
||||
}
|
||||
if got["card1"] != 67.0 {
|
||||
t.Fatalf("card1=%v want 67.0", got["card1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeROCmCardKey(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"0": "card0",
|
||||
"card1": "card1",
|
||||
"Device": "",
|
||||
"": "",
|
||||
}
|
||||
for input, want := range tests {
|
||||
if got := normalizeROCmCardKey(input); got != want {
|
||||
t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
@@ -16,6 +17,14 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -69,6 +78,45 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
return board
|
||||
}
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
slog.Info("bmc: collected", "version", version)
|
||||
return []schema.HardwareFirmwareRecord{
|
||||
{DeviceName: "BMC", Version: version},
|
||||
}
|
||||
}
|
||||
|
||||
// parseBMCFirmwareRevision extracts the "Firmware Revision" field from ipmitool mc info output.
|
||||
func parseBMCFirmwareRevision(out string) string {
|
||||
for _, line := range strings.Split(out, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ":")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Firmware Revision" {
|
||||
return strings.TrimSpace(val)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parseBIOSFirmware extracts BIOS version from dmidecode type 0 output.
|
||||
func parseBIOSFirmware(type0 string) []schema.HardwareFirmwareRecord {
|
||||
fields := parseDMIFields(type0, "BIOS Information")
|
||||
|
||||
@@ -23,8 +23,9 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
|
||||
snap.CPUs = collectCPUs(snap.Board.SerialNumber)
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
snap.Memory = collectMemory()
|
||||
sensorDoc, err := readSensorsJSONDoc()
|
||||
@@ -35,7 +36,9 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
|
||||
@@ -3,7 +3,6 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
@@ -12,14 +11,14 @@ import (
|
||||
)
|
||||
|
||||
// collectCPUs runs dmidecode -t 4 and enriches CPUs with microcode from sysfs.
|
||||
func collectCPUs(boardSerial string) []schema.HardwareCPU {
|
||||
func collectCPUs() []schema.HardwareCPU {
|
||||
out, err := runDmidecode("4")
|
||||
if err != nil {
|
||||
slog.Warn("cpu: dmidecode type 4 failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
cpus := parseCPUs(out, boardSerial)
|
||||
cpus := parseCPUs(out)
|
||||
if mc := readMicrocode(); mc != "" {
|
||||
for i := range cpus {
|
||||
cpus[i].Firmware = &mc
|
||||
@@ -31,12 +30,12 @@ func collectCPUs(boardSerial string) []schema.HardwareCPU {
|
||||
}
|
||||
|
||||
// parseCPUs splits dmidecode output into per-processor sections and parses each.
|
||||
func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
||||
func parseCPUs(output string) []schema.HardwareCPU {
|
||||
sections := splitDMISections(output, "Processor Information")
|
||||
cpus := make([]schema.HardwareCPU, 0, len(sections))
|
||||
|
||||
for _, section := range sections {
|
||||
cpu, ok := parseCPUSection(section, boardSerial)
|
||||
cpu, ok := parseCPUSection(section)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
@@ -47,7 +46,7 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
||||
|
||||
// parseCPUSection parses one "Processor Information" block into a HardwareCPU.
|
||||
// Returns false if the socket is unpopulated.
|
||||
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
||||
func parseCPUSection(fields map[string]string) (schema.HardwareCPU, bool) {
|
||||
status := parseCPUStatus(fields["Status"])
|
||||
if status == statusEmpty {
|
||||
return schema.HardwareCPU{}, false
|
||||
@@ -70,11 +69,6 @@ func parseCPUSection(fields map[string]string, boardSerial string) (schema.Hardw
|
||||
}
|
||||
if v := cleanDMIValue(fields["Serial Number"]); v != "" {
|
||||
cpu.SerialNumber = &v
|
||||
} else if boardSerial != "" && cpu.Socket != nil {
|
||||
// Intel Xeon never exposes serial via DMI — generate stable fallback
|
||||
// matching core's generateCPUVendorSerial() logic
|
||||
fb := fmt.Sprintf("%s-CPU-%d", boardSerial, *cpu.Socket)
|
||||
cpu.SerialNumber = &fb
|
||||
}
|
||||
|
||||
if v := parseMHz(fields["Max Speed"]); v > 0 {
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
func TestParseCPUs_dual_socket(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type4.txt")
|
||||
cpus := parseCPUs(out, "CAR315KA0803B90")
|
||||
cpus := parseCPUs(out)
|
||||
|
||||
if len(cpus) != 2 {
|
||||
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||
@@ -39,23 +39,22 @@ func TestParseCPUs_dual_socket(t *testing.T) {
|
||||
if cpu0.Status == nil || *cpu0.Status != "OK" {
|
||||
t.Errorf("cpu0 status: got %v, want OK", cpu0.Status)
|
||||
}
|
||||
// Intel Xeon serial not available → fallback
|
||||
if cpu0.SerialNumber == nil || *cpu0.SerialNumber != "CAR315KA0803B90-CPU-0" {
|
||||
t.Errorf("cpu0 serial fallback: got %v, want CAR315KA0803B90-CPU-0", cpu0.SerialNumber)
|
||||
if cpu0.SerialNumber != nil {
|
||||
t.Errorf("cpu0 serial should stay nil without source data, got %v", cpu0.SerialNumber)
|
||||
}
|
||||
|
||||
cpu1 := cpus[1]
|
||||
if cpu1.Socket == nil || *cpu1.Socket != 1 {
|
||||
t.Errorf("cpu1 socket: got %v, want 1", cpu1.Socket)
|
||||
}
|
||||
if cpu1.SerialNumber == nil || *cpu1.SerialNumber != "CAR315KA0803B90-CPU-1" {
|
||||
t.Errorf("cpu1 serial fallback: got %v, want CAR315KA0803B90-CPU-1", cpu1.SerialNumber)
|
||||
if cpu1.SerialNumber != nil {
|
||||
t.Errorf("cpu1 serial should stay nil without source data, got %v", cpu1.SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCPUs_unpopulated_skipped(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type4_disabled.txt")
|
||||
cpus := parseCPUs(out, "BOARD-001")
|
||||
cpus := parseCPUs(out)
|
||||
|
||||
if len(cpus) != 1 {
|
||||
t.Fatalf("expected 1 CPU (unpopulated skipped), got %d", len(cpus))
|
||||
@@ -87,7 +86,7 @@ func TestCollectCPUsSetsFirmwareFromMicrocode(t *testing.T) {
|
||||
}
|
||||
t.Cleanup(func() { execDmidecode = origRun })
|
||||
|
||||
cpus := collectCPUs("CAR315KA0803B90")
|
||||
cpus := collectCPUs()
|
||||
if len(cpus) != 2 {
|
||||
t.Fatalf("expected 2 CPUs, got %d", len(cpus))
|
||||
}
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
)
|
||||
import "bee/audit/internal/schema"
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
@@ -11,7 +8,6 @@ func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
deduplicateComponentSerials(snap)
|
||||
}
|
||||
|
||||
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
|
||||
@@ -45,7 +41,18 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
hasIdentity := false
|
||||
switch {
|
||||
case psu.SerialNumber != nil && *psu.SerialNumber != "":
|
||||
hasIdentity = true
|
||||
case psu.Slot != nil && *psu.Slot != "":
|
||||
hasIdentity = true
|
||||
case psu.Model != nil && *psu.Model != "":
|
||||
hasIdentity = true
|
||||
case psu.Vendor != nil && *psu.Vendor != "":
|
||||
hasIdentity = true
|
||||
}
|
||||
if !hasIdentity {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
@@ -79,101 +86,3 @@ func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt stri
|
||||
status.StatusCheckedAt = &collectedAt
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
|
||||
deduplicateCPUSerials(snap.CPUs)
|
||||
deduplicateMemorySerials(snap.Memory)
|
||||
deduplicateStorageSerials(snap.Storage)
|
||||
deduplicatePCIeSerials(snap.PCIeDevices)
|
||||
deduplicatePSUSerials(snap.PowerSupplies)
|
||||
}
|
||||
|
||||
func deduplicateCPUSerials(items []schema.HardwareCPU) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateMemorySerials(items []schema.HardwareMemory) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].PartNumber)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateStorageSerials(items []schema.HardwareStorage) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,7 +39,7 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||
func TestFinalizeSnapshotPreservesDuplicateSerials(t *testing.T) {
|
||||
collectedAt := "2026-03-15T12:00:00Z"
|
||||
status := statusOK
|
||||
model := "Device"
|
||||
@@ -57,7 +57,24 @@ func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||
if got := *snap.Storage[0].SerialNumber; got != serial {
|
||||
t.Fatalf("first serial changed: %q", got)
|
||||
}
|
||||
if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
|
||||
t.Fatalf("duplicate serial mismatch: %q", got)
|
||||
if got := *snap.Storage[1].SerialNumber; got != serial {
|
||||
t.Fatalf("duplicate serial should stay unchanged: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFilterPSUsKeepsSlotOnlyEntries(t *testing.T) {
|
||||
slot := "0"
|
||||
status := statusOK
|
||||
|
||||
got := filterPSUs([]schema.HardwarePowerSupply{
|
||||
{Slot: &slot, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("unexpected kept PSU: %+v", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,11 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
if devs[i].SerialNumber == nil {
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSFPDOM(t *testing.T) {
|
||||
raw := `
|
||||
@@ -29,6 +33,74 @@ func TestParseSFPDOM(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLSPCIDetailSerial(t *testing.T) {
|
||||
raw := `
|
||||
05:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6]
|
||||
Serial number: NIC-SN-12345
|
||||
`
|
||||
if got := parseLSPCIDetailSerial(raw); got != "NIC-SN-12345" {
|
||||
t.Fatalf("serial=%q want %q", got, "NIC-SN-12345")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePCIVPDSerial(t *testing.T) {
|
||||
raw := []byte{0x82, 0x05, 0x00, 'M', 'L', 'X', '5', 0x90, 0x08, 0x00, 'S', 'N', 0x08, 'M', 'T', '1', '2', '3', '4', '5', '6'}
|
||||
if got := parsePCIVPDSerial(raw); got != "MT123456" {
|
||||
t.Fatalf("serial=%q want %q", got, "MT123456")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:18:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: NIC-SN-98765\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
if iface != "eth0" {
|
||||
t.Fatalf("unexpected iface: %s", iface)
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "NIC-SN-98765" {
|
||||
t.Fatalf("serial=%v want NIC-SN-98765", out[0].SerialNumber)
|
||||
}
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -24,18 +24,17 @@ type nvidiaGPUInfo struct {
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
|
||||
// a stable serial fallback based on board serial + slot.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
if !hasNVIDIADevices(devs) {
|
||||
return devs
|
||||
}
|
||||
gpuByBDF, err := queryNVIDIAGPUs()
|
||||
if err != nil {
|
||||
slog.Info("nvidia: enrichment skipped", "err", err)
|
||||
return enrichPCIeWithNVIDIAData(devs, nil, boardSerial, false)
|
||||
return enrichPCIeWithNVIDIAData(devs, nil, false)
|
||||
}
|
||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, boardSerial, true)
|
||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, true)
|
||||
}
|
||||
|
||||
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
@@ -47,7 +46,7 @@ func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, boardSerial string, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isNVIDIADevice(devs[i]) {
|
||||
@@ -55,7 +54,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
}
|
||||
|
||||
if !driverLoaded {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
setPCIeFallback(&devs[i])
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -65,14 +64,12 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
}
|
||||
info, ok := gpuByBDF[bdf]
|
||||
if !ok {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
setPCIeFallback(&devs[i])
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
} else {
|
||||
setPCIeFallbackSerial(&devs[i], boardSerial)
|
||||
}
|
||||
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
||||
devs[i].Firmware = &v
|
||||
@@ -213,26 +210,11 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
setPCIeFallbackSerial(dev, boardSerial)
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
status := statusUnknown
|
||||
dev.Status = &status
|
||||
}
|
||||
|
||||
func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
if strings.TrimSpace(boardSerial) == "" || dev.SerialNumber != nil {
|
||||
return
|
||||
}
|
||||
slot := "unknown"
|
||||
if dev.BDF != nil && strings.TrimSpace(*dev.BDF) != "" {
|
||||
slot = strings.TrimSpace(*dev.BDF)
|
||||
} else if dev.Slot != nil && strings.TrimSpace(*dev.Slot) != "" {
|
||||
slot = strings.TrimSpace(*dev.Slot)
|
||||
}
|
||||
fb := fmt.Sprintf("%s-PCIE-%s", boardSerial, slot)
|
||||
dev.SerialNumber = &fb
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.TemperatureC != nil {
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
|
||||
@@ -73,7 +73,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
out := enrichPCIeWithNVIDIAData(devices, byBDF, "BOARD-001", true)
|
||||
out := enrichPCIeWithNVIDIAData(devices, byBDF, true)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-ABC" {
|
||||
t.Fatalf("serial: got %v", out[0].SerialNumber)
|
||||
}
|
||||
@@ -103,9 +103,9 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
out := enrichPCIeWithNVIDIAData(devices, nil, "BOARD-123", false)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
||||
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
||||
out := enrichPCIeWithNVIDIAData(devices, nil, false)
|
||||
if out[0].SerialNumber != nil {
|
||||
t.Fatalf("serial should stay nil without source data, got %v", out[0].SerialNumber)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusUnknown {
|
||||
t.Fatalf("fallback status: got %v", out[0].Status)
|
||||
|
||||
@@ -37,7 +37,7 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
val := strings.TrimSpace(line[idx+2:])
|
||||
fields[key] = val
|
||||
}
|
||||
if !shouldIncludePCIeDevice(fields["Class"]) {
|
||||
if !shouldIncludePCIeDevice(fields["Class"], fields["Vendor"], fields["Device"]) {
|
||||
continue
|
||||
}
|
||||
dev := parseLspciDevice(fields)
|
||||
@@ -46,8 +46,10 @@ func parseLspci(output string) []schema.HardwarePCIeDevice {
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldIncludePCIeDevice(class string) bool {
|
||||
func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
c := strings.ToLower(strings.TrimSpace(class))
|
||||
v := strings.ToLower(strings.TrimSpace(vendor))
|
||||
d := strings.ToLower(strings.TrimSpace(device))
|
||||
if c == "" {
|
||||
return true
|
||||
}
|
||||
@@ -57,6 +59,8 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
"system peripheral",
|
||||
"communication controller",
|
||||
@@ -66,12 +70,28 @@ func shouldIncludePCIeDevice(class string) bool {
|
||||
"audio device",
|
||||
"serial bus controller",
|
||||
"unassigned class",
|
||||
"non-essential instrumentation",
|
||||
}
|
||||
for _, bad := range excluded {
|
||||
if strings.Contains(c, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
"reserved spp",
|
||||
"ptdma",
|
||||
"cryptographic coprocessor pspcpp",
|
||||
"pspcpp",
|
||||
}
|
||||
for _, bad := range internalAMDPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -84,6 +104,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
if bdf := fields["Slot"]; bdf != "" {
|
||||
dev.Slot = &bdf
|
||||
dev.BDF = &bdf
|
||||
// parse vendor_id and device_id from sysfs
|
||||
vendorID, deviceID := readPCIIDs(bdf)
|
||||
@@ -95,6 +116,8 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
@@ -162,6 +185,18 @@ func readPCINumaNode(bdf string) (int, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
func parsePCINumaNode(raw string) (int, bool) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return 0, false
|
||||
}
|
||||
value, err := strconv.Atoi(raw)
|
||||
if err != nil || value < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return value, true
|
||||
}
|
||||
|
||||
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
|
||||
out, err := exec.Command("cat", "/sys/bus/pci/devices/"+bdf+"/"+attribute).Output()
|
||||
if err != nil {
|
||||
|
||||
@@ -1,34 +1,49 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
class string
|
||||
want bool
|
||||
name string
|
||||
class string
|
||||
vendor string
|
||||
device string
|
||||
want bool
|
||||
}{
|
||||
{"USB controller", false},
|
||||
{"System peripheral", false},
|
||||
{"Audio device", false},
|
||||
{"Host bridge", false},
|
||||
{"PCI bridge", false},
|
||||
{"SMBus", false},
|
||||
{"Ethernet controller", true},
|
||||
{"RAID bus controller", true},
|
||||
{"Non-Volatile memory controller", true},
|
||||
{"VGA compatible controller", true},
|
||||
{name: "usb", class: "USB controller", want: false},
|
||||
{name: "system peripheral", class: "System peripheral", want: false},
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
{name: "amd dummy function", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse PTDMA", want: false},
|
||||
{name: "amd pspcpp", class: "Encryption controller", vendor: "Advanced Micro Devices, Inc. [AMD]", device: "Starship/Matisse Cryptographic Coprocessor PSPCPP", want: false},
|
||||
{name: "ethernet", class: "Ethernet controller", want: true},
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := shouldIncludePCIeDevice(tt.class)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class %q include=%v want %v", tt.class, got, tt.want)
|
||||
}
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := shouldIncludePCIeDevice(tt.class, tt.vendor, tt.device)
|
||||
if got != tt.want {
|
||||
t.Fatalf("class=%q vendor=%q device=%q include=%v want %v", tt.class, tt.vendor, tt.device, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
input := "Slot:\t0000:00:14.0\nClass:\tUSB controller\nVendor:\tIntel Corporation\nDevice:\tUSB 3.0\n\n" +
|
||||
"Slot:\t0000:00:18.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
@@ -38,6 +53,56 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
|
||||
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
||||
}
|
||||
if devs[0].Slot == nil || *devs[0].Slot != "0000:65:00.0" {
|
||||
t.Fatalf("slot: got %v", devs[0].Slot)
|
||||
}
|
||||
if devs[0].BDF == nil || *devs[0].BDF != "0000:65:00.0" {
|
||||
t.Fatalf("bdf: got %v", devs[0].BDF)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:1a:00.0\nClass:\tNon-Essential Instrumentation\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PCIe Dummy Function\n\n" +
|
||||
"Slot:\t0000:1a:00.2\nClass:\tEncryption controller\nVendor:\tAdvanced Micro Devices, Inc. [AMD]\nDevice:\tStarship/Matisse PTDMA\n\n" +
|
||||
"Slot:\t0000:05:00.0\nClass:\tEthernet controller\nVendor:\tMellanox Technologies\nDevice:\tMT28908 Family [ConnectX-6]\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "MT28908 Family [ConnectX-6]" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
data, err := json.Marshal(devs[0])
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
if !strings.Contains(text, `"slot":"0000:65:00.0"`) {
|
||||
t.Fatalf("json missing slot: %s", text)
|
||||
}
|
||||
if strings.Contains(text, `"bdf"`) {
|
||||
t.Fatalf("json should not emit bdf: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspciUsesNUMANodeFieldWhenSysfsUnavailable(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tEthernet controller\nVendor:\tIntel Corporation\nDevice:\tX710\nNUMANode:\t1\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].NUMANode == nil || *devs[0].NUMANode != 1 {
|
||||
t.Fatalf("numa_node=%v want 1", devs[0].NUMANode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
|
||||
123
audit/internal/collector/pcie_identity.go
Normal file
123
audit/internal/collector/pcie_identity.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("lspci", "-vv", "-s", bdf).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
readPCIVPDFile = func(bdf string) ([]byte, error) {
|
||||
return os.ReadFile(filepath.Join("/sys/bus/pci/devices", bdf, "vpd"))
|
||||
}
|
||||
)
|
||||
|
||||
func enrichPCIeWithPCISerials(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !shouldProbePCIeSerial(devs[i]) {
|
||||
continue
|
||||
}
|
||||
bdf := normalizePCIeBDF(*devs[i].BDF)
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
if serial := queryPCIDeviceSerial(bdf); serial != "" {
|
||||
devs[i].SerialNumber = &serial
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
if enriched > 0 {
|
||||
slog.Info("pcie: serials enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func shouldProbePCIeSerial(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.BDF == nil || dev.SerialNumber != nil {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(class) || isGPUClass(class)
|
||||
}
|
||||
|
||||
func queryPCIDeviceSerial(bdf string) string {
|
||||
if out, err := queryPCILSPCIDetail(bdf); err == nil {
|
||||
if serial := parseLSPCIDetailSerial(out); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
if raw, err := readPCIVPDFile(bdf); err == nil {
|
||||
return parsePCIVPDSerial(raw)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseLSPCIDetailSerial(raw string) string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
lower := strings.ToLower(line)
|
||||
if !strings.Contains(lower, "serial number:") {
|
||||
continue
|
||||
}
|
||||
idx := strings.Index(line, ":")
|
||||
if idx < 0 {
|
||||
continue
|
||||
}
|
||||
if serial := strings.TrimSpace(line[idx+1:]); serial != "" {
|
||||
return serial
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parsePCIVPDSerial(raw []byte) string {
|
||||
for i := 0; i+3 < len(raw); i++ {
|
||||
if raw[i] != 'S' || raw[i+1] != 'N' {
|
||||
continue
|
||||
}
|
||||
length := int(raw[i+2])
|
||||
if length <= 0 || length > 64 || i+3+length > len(raw) {
|
||||
continue
|
||||
}
|
||||
value := strings.TrimSpace(strings.Trim(string(raw[i+3:i+3+length]), "\x00"))
|
||||
if !looksLikeSerial(value) {
|
||||
continue
|
||||
}
|
||||
return value
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func looksLikeSerial(value string) bool {
|
||||
if len(value) < 4 {
|
||||
return false
|
||||
}
|
||||
hasAlphaNum := false
|
||||
for _, r := range value {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
|
||||
hasAlphaNum = true
|
||||
case strings.ContainsRune(" -_./:", r):
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
return hasAlphaNum
|
||||
}
|
||||
47
audit/internal/collector/pcie_identity_test.go
Normal file
47
audit/internal/collector/pcie_identity_test.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnrichPCIeWithPCISerialsAddsGPUFallback(t *testing.T) {
|
||||
origDetail := queryPCILSPCIDetail
|
||||
origVPD := readPCIVPDFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
if bdf != "0000:11:00.0" {
|
||||
t.Fatalf("unexpected bdf: %s", bdf)
|
||||
}
|
||||
return "Serial number: GPU-SN-12345\n", nil
|
||||
}
|
||||
readPCIVPDFile = func(string) ([]byte, error) {
|
||||
return nil, fmt.Errorf("no vpd needed")
|
||||
}
|
||||
|
||||
class := "DisplayController"
|
||||
bdf := "0000:11:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithPCISerials(devs)
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "GPU-SN-12345" {
|
||||
t.Fatalf("serial=%v want GPU-SN-12345", out[0].SerialNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldProbePCIeSerialSkipsNonGPUOrNIC(t *testing.T) {
|
||||
class := "StorageController"
|
||||
bdf := "0000:19:00.0"
|
||||
dev := schema.HardwarePCIeDevice{DeviceClass: &class, BDF: &bdf}
|
||||
if shouldProbePCIeSerial(dev) {
|
||||
t.Fatal("unexpected probe for storage controller")
|
||||
}
|
||||
}
|
||||
@@ -5,21 +5,31 @@ import (
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
// ipmitool requires /dev/ipmi0 — not available on non-server hardware
|
||||
out, err := exec.Command("ipmitool", "fru", "print").Output()
|
||||
if err != nil {
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
} else {
|
||||
mergePSUSDR(psus, sdrData)
|
||||
}
|
||||
} else if len(psus) == 0 {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
psus := parseFRU(string(out))
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
mergePSUSDR(psus, parsePSUSDR(string(sdrOut)))
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
return psus
|
||||
}
|
||||
@@ -79,9 +89,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
|
||||
// Only process PSU FRU records
|
||||
headerLower := strings.ToLower(header)
|
||||
if !strings.Contains(headerLower, "psu") &&
|
||||
!strings.Contains(headerLower, "power supply") &&
|
||||
!strings.Contains(headerLower, "power_supply") {
|
||||
if !isPSUHeader(headerLower) {
|
||||
return schema.HardwarePowerSupply{}, false
|
||||
}
|
||||
|
||||
@@ -89,21 +97,24 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
psu := schema.HardwarePowerSupply{Present: &present}
|
||||
|
||||
slotStr := strconv.Itoa(slotIdx)
|
||||
if slot, ok := parsePSUSlot(header); ok && slot > 0 {
|
||||
slotStr = strconv.Itoa(slot - 1)
|
||||
}
|
||||
psu.Slot = &slotStr
|
||||
|
||||
if v := cleanDMIValue(fields["Board Product"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Product", "Product Name", "Product Part Number"); v != "" {
|
||||
psu.Model = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Mfg"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Mfg", "Product Manufacturer", "Product Manufacturer Name"); v != "" {
|
||||
psu.Vendor = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Serial"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Serial", "Product Serial", "Product Serial Number"); v != "" {
|
||||
psu.SerialNumber = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Part Number"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Part Number", "Product Part Number", "Part Number"); v != "" {
|
||||
psu.PartNumber = &v
|
||||
}
|
||||
if v := cleanDMIValue(fields["Board Extra"]); v != "" {
|
||||
if v := firstNonEmptyField(fields, "Board Extra", "Product Version", "Board Version"); v != "" {
|
||||
psu.Firmware = &v
|
||||
}
|
||||
|
||||
@@ -120,6 +131,23 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
return psu, true
|
||||
}
|
||||
|
||||
func isPSUHeader(headerLower string) bool {
|
||||
return strings.Contains(headerLower, "psu") ||
|
||||
strings.Contains(headerLower, "pws") ||
|
||||
strings.Contains(headerLower, "power supply") ||
|
||||
strings.Contains(headerLower, "power_supply") ||
|
||||
strings.Contains(headerLower, "power module")
|
||||
}
|
||||
|
||||
func firstNonEmptyField(fields map[string]string, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
if value := cleanDMIValue(fields[key]); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
type psuSDR struct {
|
||||
slot int
|
||||
status string
|
||||
@@ -131,7 +159,13 @@ type psuSDR struct {
|
||||
healthPct *float64
|
||||
}
|
||||
|
||||
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
|
||||
var psuSlotPatterns = []*regexp.Regexp{
|
||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
||||
}
|
||||
|
||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
out := map[int]psuSDR{}
|
||||
@@ -164,6 +198,8 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
entry.inputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "output power"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
@@ -176,6 +212,49 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
return out
|
||||
}
|
||||
|
||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||
if len(sdr) == 0 {
|
||||
return nil
|
||||
}
|
||||
slots := make([]int, 0, len(sdr))
|
||||
for slot := range sdr {
|
||||
slots = append(slots, slot)
|
||||
}
|
||||
sort.Ints(slots)
|
||||
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(slots))
|
||||
for _, slot := range slots {
|
||||
entry := sdr[slot]
|
||||
present := true
|
||||
status := entry.status
|
||||
if status == "" {
|
||||
status = statusUnknown
|
||||
}
|
||||
slotStr := strconv.Itoa(slot - 1)
|
||||
model := "PSU"
|
||||
psu := schema.HardwarePowerSupply{
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Slot: &slotStr,
|
||||
Present: &present,
|
||||
Model: &model,
|
||||
InputPowerW: entry.inputPowerW,
|
||||
OutputPowerW: entry.outputPowerW,
|
||||
InputVoltage: entry.inputVoltage,
|
||||
TemperatureC: entry.temperatureC,
|
||||
}
|
||||
if entry.healthPct != nil {
|
||||
psu.LifeRemainingPct = entry.healthPct
|
||||
lifeUsed := 100 - *entry.healthPct
|
||||
psu.LifeUsedPct = &lifeUsed
|
||||
}
|
||||
if entry.reason != "" {
|
||||
psu.ErrorDescription = &entry.reason
|
||||
}
|
||||
out = append(out, psu)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
||||
for i := range psus {
|
||||
slotIdx, err := strconv.Atoi(derefPSUSlot(psus[i].Slot))
|
||||
@@ -231,17 +310,19 @@ func splitSDRFields(line string) []string {
|
||||
}
|
||||
|
||||
func parsePSUSlot(name string) (int, bool) {
|
||||
m := psuSlotRe.FindStringSubmatch(strings.ToLower(name))
|
||||
if len(m) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
for _, group := range m[1:] {
|
||||
if group == "" {
|
||||
for _, re := range psuSlotPatterns {
|
||||
m := re.FindStringSubmatch(strings.ToLower(name))
|
||||
if len(m) == 0 {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(group)
|
||||
if err == nil && n > 0 {
|
||||
return n, true
|
||||
for _, group := range m[1:] {
|
||||
if group == "" {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(group)
|
||||
if err == nil && n > 0 {
|
||||
return n, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
|
||||
@@ -38,3 +38,54 @@ PS2 Input Power | 0 Watts | cr
|
||||
t.Fatalf("ps2 status=%q", got[2].status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePSUSlotVendorVariants(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
want int
|
||||
}{
|
||||
{name: "PWS1 Status", want: 1},
|
||||
{name: "Power Supply Bay 8", want: 8},
|
||||
{name: "PS 6 Input Power", want: 6},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got, ok := parsePSUSlot(tt.name)
|
||||
if !ok || got != tt.want {
|
||||
t.Fatalf("parsePSUSlot(%q)=(%d,%v) want (%d,true)", tt.name, got, ok, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
health := 97.0
|
||||
outputPower := 915.0
|
||||
got := synthesizePSUsFromSDR(map[int]psuSDR{
|
||||
1: {
|
||||
slot: 1,
|
||||
status: statusOK,
|
||||
outputPowerW: &outputPower,
|
||||
healthPct: &health,
|
||||
},
|
||||
})
|
||||
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].Slot == nil || *got[0].Slot != "0" {
|
||||
t.Fatalf("slot=%v want 0", got[0].Slot)
|
||||
}
|
||||
if got[0].OutputPowerW == nil || *got[0].OutputPowerW != 915 {
|
||||
t.Fatalf("output power=%v", got[0].OutputPowerW)
|
||||
}
|
||||
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 97 {
|
||||
t.Fatalf("life remaining=%v", got[0].LifeRemainingPct)
|
||||
}
|
||||
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 3 {
|
||||
t.Fatalf("life used=%v", got[0].LifeUsedPct)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,19 +113,8 @@ func isLikelyPSUTemp(chip, feature string) bool {
|
||||
|
||||
func detectPSUSlot(parts ...string) (string, bool) {
|
||||
for _, part := range parts {
|
||||
lower := strings.ToLower(part)
|
||||
matches := psuSlotRe.FindStringSubmatch(lower)
|
||||
if len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, group := range matches[1:] {
|
||||
if group == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(group)
|
||||
if err == nil && value > 0 {
|
||||
return strconv.Itoa(value - 1), true
|
||||
}
|
||||
if value, ok := parsePSUSlot(part); ok && value > 0 {
|
||||
return strconv.Itoa(value - 1), true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
|
||||
@@ -5,11 +5,13 @@ import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := lsblkDevices()
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
var s schema.HardwareStorage
|
||||
@@ -39,6 +41,47 @@ type lsblkRoot struct {
|
||||
Blockdevices []lsblkDevice `json:"blockdevices"`
|
||||
}
|
||||
|
||||
type nvmeListRoot struct {
|
||||
Devices []nvmeListDevice `json:"Devices"`
|
||||
}
|
||||
|
||||
type nvmeListDevice struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
ModelNumber string `json:"ModelNumber"`
|
||||
SerialNumber string `json:"SerialNumber"`
|
||||
Firmware string `json:"Firmware"`
|
||||
PhysicalSize int64 `json:"PhysicalSize"`
|
||||
}
|
||||
|
||||
func discoverStorageDevices() []lsblkDevice {
|
||||
merged := map[string]lsblkDevice{}
|
||||
for _, dev := range lsblkDevices() {
|
||||
if dev.Name == "" {
|
||||
continue
|
||||
}
|
||||
merged[dev.Name] = dev
|
||||
}
|
||||
for _, dev := range nvmeListDevices() {
|
||||
if dev.Name == "" {
|
||||
continue
|
||||
}
|
||||
current := merged[dev.Name]
|
||||
merged[dev.Name] = mergeStorageDevice(current, dev)
|
||||
}
|
||||
|
||||
disks := make([]lsblkDevice, 0, len(merged))
|
||||
for _, dev := range merged {
|
||||
if dev.Type == "" {
|
||||
dev.Type = "disk"
|
||||
}
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
@@ -60,6 +103,59 @@ func lsblkDevices() []lsblkDevice {
|
||||
return disks
|
||||
}
|
||||
|
||||
func nvmeListDevices() []lsblkDevice {
|
||||
out, err := exec.Command("nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root nvmeListRoot
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
slog.Warn("storage: nvme list parse failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
devices := make([]lsblkDevice, 0, len(root.Devices))
|
||||
for _, dev := range root.Devices {
|
||||
name := filepath.Base(strings.TrimSpace(dev.DevicePath))
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, lsblkDevice{
|
||||
Name: name,
|
||||
Type: "disk",
|
||||
Size: strconv.FormatInt(dev.PhysicalSize, 10),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Tran: "nvme",
|
||||
})
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
func mergeStorageDevice(existing, incoming lsblkDevice) lsblkDevice {
|
||||
if existing.Name == "" {
|
||||
return incoming
|
||||
}
|
||||
if existing.Type == "" {
|
||||
existing.Type = incoming.Type
|
||||
}
|
||||
if strings.TrimSpace(existing.Size) == "" {
|
||||
existing.Size = incoming.Size
|
||||
}
|
||||
if strings.TrimSpace(existing.Serial) == "" {
|
||||
existing.Serial = incoming.Serial
|
||||
}
|
||||
if strings.TrimSpace(existing.Model) == "" {
|
||||
existing.Model = incoming.Model
|
||||
}
|
||||
if strings.TrimSpace(existing.Tran) == "" {
|
||||
existing.Tran = incoming.Tran
|
||||
}
|
||||
if strings.TrimSpace(existing.Hctl) == "" {
|
||||
existing.Hctl = incoming.Hctl
|
||||
}
|
||||
return existing
|
||||
}
|
||||
|
||||
// smartctlInfo is the subset of smartctl -j -a output we care about.
|
||||
type smartctlInfo struct {
|
||||
ModelFamily string `json:"model_family"`
|
||||
@@ -94,6 +190,7 @@ type smartctlInfo struct {
|
||||
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -252,9 +349,22 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
s.Model = &v
|
||||
}
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Serial)); v != "" {
|
||||
s.SerialNumber = &v
|
||||
}
|
||||
if size := parseStorageBytes(dev.Size); size > 0 {
|
||||
gb := int(size / 1_000_000_000)
|
||||
if gb > 0 {
|
||||
s.SizeGB = &gb
|
||||
}
|
||||
}
|
||||
|
||||
// id-ctrl: model, serial, firmware, capacity
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
@@ -335,6 +445,14 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
return s
|
||||
}
|
||||
|
||||
func parseStorageBytes(raw string) int64 {
|
||||
value, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
|
||||
if err == nil && value > 0 {
|
||||
return value
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
if units <= 0 {
|
||||
return 0
|
||||
|
||||
33
audit/internal/collector/storage_discovery_test.go
Normal file
33
audit/internal/collector/storage_discovery_test.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := mergeStorageDevice(
|
||||
lsblkDevice{Name: "nvme0n1", Type: "disk", Tran: "nvme"},
|
||||
lsblkDevice{Name: "nvme0n1", Type: "disk", Size: "1024", Serial: "SN123", Model: "Kioxia"},
|
||||
)
|
||||
|
||||
if got.Serial != "SN123" {
|
||||
t.Fatalf("serial=%q want SN123", got.Serial)
|
||||
}
|
||||
if got.Model != "Kioxia" {
|
||||
t.Fatalf("model=%q want Kioxia", got.Model)
|
||||
}
|
||||
if got.Size != "1024" {
|
||||
t.Fatalf("size=%q want 1024", got.Size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := parseStorageBytes(" 2048 "); got != 2048 {
|
||||
t.Fatalf("parseStorageBytes=%d want 2048", got)
|
||||
}
|
||||
if got := parseStorageBytes("1.92 TB"); got != 0 {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
@@ -9,8 +9,50 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
var exportExecCommand = exec.Command
|
||||
|
||||
func formatMountTargetError(target RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
|
||||
func removableTargetReadOnly(fields map[string]string) bool {
|
||||
if fields["RO"] == "1" {
|
||||
return true
|
||||
}
|
||||
switch strings.ToLower(strings.TrimSpace(fields["FSTYPE"])) {
|
||||
case "iso9660", "squashfs":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func ensureWritableMountpoint(mountpoint string) error {
|
||||
probe, err := os.CreateTemp(mountpoint, ".bee-write-test-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||
}
|
||||
name := probe.Name()
|
||||
if closeErr := probe.Close(); closeErr != nil {
|
||||
_ = os.Remove(name)
|
||||
return closeErr
|
||||
}
|
||||
if err := os.Remove(name); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
raw, err := exec.Command("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
raw, err := exportExecCommand("lsblk", "-P", "-o", "NAME,TYPE,PKNAME,RM,RO,FSTYPE,MOUNTPOINT,SIZE,LABEL,MODEL").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -34,7 +76,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if !removable || fields["FSTYPE"] == "" {
|
||||
if !removable || fields["FSTYPE"] == "" || removableTargetReadOnly(fields) {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -52,7 +94,7 @@ func (s *System) ListRemovableTargets() ([]RemovableTarget, error) {
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string, error) {
|
||||
func (s *System) ExportFileToTarget(src string, target RemovableTarget) (dst string, retErr error) {
|
||||
if src == "" || target.Device == "" {
|
||||
return "", fmt.Errorf("source and target are required")
|
||||
}
|
||||
@@ -62,20 +104,43 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
|
||||
mountpoint := strings.TrimSpace(target.Mountpoint)
|
||||
mountedHere := false
|
||||
mounted := mountpoint != ""
|
||||
if mountpoint == "" {
|
||||
mountpoint = filepath.Join("/tmp", "bee-export-"+filepath.Base(target.Device))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if raw, err := exec.Command("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
if raw, err := exportExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
_ = os.Remove(mountpoint)
|
||||
return string(raw), err
|
||||
return "", formatMountTargetError(target, string(raw), err)
|
||||
}
|
||||
mountedHere = true
|
||||
mounted = true
|
||||
}
|
||||
defer func() {
|
||||
if !mounted {
|
||||
return
|
||||
}
|
||||
_ = exportExecCommand("sync").Run()
|
||||
if raw, err := exportExecCommand("umount", mountpoint).CombinedOutput(); err != nil && retErr == nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
retErr = err
|
||||
} else {
|
||||
retErr = fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
}
|
||||
if mountedHere {
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := ensureWritableMountpoint(mountpoint); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
filename := filepath.Base(src)
|
||||
dst := filepath.Join(mountpoint, filename)
|
||||
dst = filepath.Join(mountpoint, filename)
|
||||
data, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return "", err
|
||||
@@ -83,12 +148,6 @@ func (s *System) ExportFileToTarget(src string, target RemovableTarget) (string,
|
||||
if err := os.WriteFile(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = exec.Command("sync").Run()
|
||||
|
||||
if mountedHere {
|
||||
_ = exec.Command("umount", mountpoint).Run()
|
||||
_ = os.Remove(mountpoint)
|
||||
}
|
||||
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
112
audit/internal/platform/export_test.go
Normal file
112
audit/internal/platform/export_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestExportFileToTargetUnmountsExistingMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
|
||||
var calls [][]string
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
calls = append(calls, append([]string{name}, args...))
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
dst, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ExportFileToTarget error: %v", err)
|
||||
}
|
||||
if got, want := dst, filepath.Join(mountpoint, "bundle.tar.gz"); got != want {
|
||||
t.Fatalf("dst=%q want %q", got, want)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(mountpoint, "bundle.tar.gz")); err != nil {
|
||||
t.Fatalf("exported file missing: %v", err)
|
||||
}
|
||||
|
||||
foundUmount := false
|
||||
for _, call := range calls {
|
||||
if len(call) == 2 && call[0] == "umount" && call[1] == mountpoint {
|
||||
foundUmount = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundUmount {
|
||||
t.Fatalf("expected umount %q call, got %#v", mountpoint, calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportFileToTargetRejectsNonWritableMountpoint(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
src := filepath.Join(tmp, "bundle.tar.gz")
|
||||
mountpoint := filepath.Join(tmp, "mnt")
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
t.Fatalf("mkdir mountpoint: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(src, []byte("bundle"), 0644); err != nil {
|
||||
t.Fatalf("write src: %v", err)
|
||||
}
|
||||
if err := os.Chmod(mountpoint, 0555); err != nil {
|
||||
t.Fatalf("chmod mountpoint: %v", err)
|
||||
}
|
||||
|
||||
oldExec := exportExecCommand
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "exit 0")
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
_, err := s.ExportFileToTarget(src, RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
Mountpoint: mountpoint,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-writable mountpoint")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "target filesystem is not writable") {
|
||||
t.Fatalf("err=%q want writable message", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRemovableTargetsSkipsReadOnlyMedia(t *testing.T) {
|
||||
oldExec := exportExecCommand
|
||||
lsblkOut := `NAME="sda1" TYPE="part" PKNAME="sda" RM="1" RO="1" FSTYPE="iso9660" MOUNTPOINT="/run/live/medium" SIZE="3.7G" LABEL="BEE" MODEL=""
|
||||
NAME="sdb1" TYPE="part" PKNAME="sdb" RM="1" RO="0" FSTYPE="vfat" MOUNTPOINT="/media/bee/USB" SIZE="29.8G" LABEL="USB" MODEL=""`
|
||||
exportExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("sh", "-c", "printf '%s\n' \"$LSBLK_OUT\"")
|
||||
cmd.Env = append(os.Environ(), "LSBLK_OUT="+lsblkOut)
|
||||
return cmd
|
||||
}
|
||||
t.Cleanup(func() { exportExecCommand = oldExec })
|
||||
|
||||
s := &System{}
|
||||
targets, err := s.ListRemovableTargets()
|
||||
if err != nil {
|
||||
t.Fatalf("ListRemovableTargets error: %v", err)
|
||||
}
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("len(targets)=%d want 1 (%+v)", len(targets), targets)
|
||||
}
|
||||
if got := targets[0].Device; got != "/dev/sdb1" {
|
||||
t.Fatalf("device=%q want /dev/sdb1", got)
|
||||
}
|
||||
}
|
||||
644
audit/internal/platform/gpu_metrics.go
Normal file
644
audit/internal/platform/gpu_metrics.go
Normal file
@@ -0,0 +1,644 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
rows = append(rows, GPUMetricRow{
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
func parseGPUFloat(s string) float64 {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "N/A" || s == "[Not Supported]" || s == "" {
|
||||
return 0
|
||||
}
|
||||
v, _ := strconv.ParseFloat(s, 64)
|
||||
return v
|
||||
}
|
||||
|
||||
// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
return sampleGPUMetrics(gpuIndices)
|
||||
}
|
||||
|
||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||
if len(lines) < 2 {
|
||||
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||
}
|
||||
|
||||
// Parse header to find column indices by name.
|
||||
headers := strings.Split(lines[0], ",")
|
||||
colIdx := func(keywords ...string) int {
|
||||
for i, h := range headers {
|
||||
hl := strings.ToLower(strings.TrimSpace(h))
|
||||
for _, kw := range keywords {
|
||||
if strings.Contains(hl, kw) {
|
||||
return i
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||
idxUse := colIdx("gpu use (%)")
|
||||
idxMem := colIdx("vram%", "memory allocated")
|
||||
idxPow := colIdx("average graphics package power", "power (w)")
|
||||
|
||||
var rows []GPUMetricRow
|
||||
for _, line := range lines[1:] {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
idx := len(rows)
|
||||
row := GPUMetricRow{GPUIndex: idx}
|
||||
get := func(i int) float64 {
|
||||
if i < 0 || i >= len(parts) {
|
||||
return 0
|
||||
}
|
||||
v := strings.TrimSpace(parts[i])
|
||||
if strings.EqualFold(v, "n/a") {
|
||||
return 0
|
||||
}
|
||||
return parseGPUFloat(v)
|
||||
}
|
||||
row.TempC = get(idxTemp)
|
||||
row.UsagePct = get(idxUse)
|
||||
row.MemUsagePct = get(idxMem)
|
||||
row.PowerW = get(idxPow)
|
||||
rows = append(rows, row)
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
|
||||
}
|
||||
return rows, nil
|
||||
}
|
||||
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
ts := time.Now().UTC().Format("2006-01-02 15:04:05 UTC")
|
||||
html := fmt.Sprintf(`<!DOCTYPE html>
|
||||
<html><head>
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||
</style>
|
||||
</head><body>
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
</body></html>`, ts, svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
const plotX2 = 840 // power axis / chart right border
|
||||
const plotY1 = 70 // top
|
||||
const plotY2 = 465 // bottom (PH = 395)
|
||||
const PW = plotX2 - plotX1
|
||||
const PH = plotY2 - plotY1
|
||||
// Outer axes
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const clockAxisX = 900 // clock axis line
|
||||
|
||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||
seriesLabel := [4]string{
|
||||
fmt.Sprintf("GPU %d Temp (°C)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Usage (%%)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Power (W)", gpuIdx),
|
||||
fmt.Sprintf("GPU %d Clock (MHz)", gpuIdx),
|
||||
}
|
||||
axisLabel := [4]string{"Temperature (°C)", "GPU Usage (%)", "Power (W)", "Clock (MHz)"}
|
||||
|
||||
// Extract series
|
||||
t := make([]float64, len(rows))
|
||||
vals := [4][]float64{}
|
||||
for i := range vals {
|
||||
vals[i] = make([]float64, len(rows))
|
||||
}
|
||||
for i, r := range rows {
|
||||
t[i] = r.ElapsedSec
|
||||
vals[0][i] = r.TempC
|
||||
vals[1][i] = r.UsagePct
|
||||
vals[2][i] = r.PowerW
|
||||
vals[3][i] = r.ClockMHz
|
||||
}
|
||||
|
||||
tMin, tMax := gpuMinMax(t)
|
||||
type axisScale struct {
|
||||
ticks []float64
|
||||
min, max float64
|
||||
}
|
||||
var axes [4]axisScale
|
||||
for i := 0; i < 4; i++ {
|
||||
mn, mx := gpuMinMax(vals[i])
|
||||
tks := gpuNiceTicks(mn, mx, 8)
|
||||
axes[i] = axisScale{ticks: tks, min: tks[0], max: tks[len(tks)-1]}
|
||||
}
|
||||
|
||||
xv := func(tv float64) float64 {
|
||||
if tMax == tMin {
|
||||
return float64(plotX1)
|
||||
}
|
||||
return float64(plotX1) + (tv-tMin)/(tMax-tMin)*float64(PW)
|
||||
}
|
||||
yv := func(v float64, ai int) float64 {
|
||||
a := axes[ai]
|
||||
if a.max == a.min {
|
||||
return float64(plotY1 + PH/2)
|
||||
}
|
||||
return float64(plotY2) - (v-a.min)/(a.max-a.min)*float64(PH)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
fmt.Fprintf(&b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d"`+
|
||||
` style="background:#fff;border-radius:8px;display:block;margin:0 auto 24px;`+
|
||||
`box-shadow:0 2px 12px rgba(0,0,0,.12)">`+"\n", W, H)
|
||||
|
||||
// Title
|
||||
fmt.Fprintf(&b, `<text x="%d" y="22" text-anchor="middle" font-family="sans-serif"`+
|
||||
` font-size="14" font-weight="bold" fill="#333">GPU Stress Test Metrics — GPU %d</text>`+"\n",
|
||||
plotX1+PW/2, gpuIdx)
|
||||
|
||||
// Horizontal grid (align to temp axis ticks)
|
||||
b.WriteString(`<g stroke="#e0e0e0" stroke-width="0.5">` + "\n")
|
||||
for _, tick := range axes[0].ticks {
|
||||
y := yv(tick, 0)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
plotX1, y, plotX2, y)
|
||||
}
|
||||
// Vertical grid
|
||||
xTicks := gpuNiceTicks(tMin, tMax, 10)
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, plotY1, x, plotY2)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
plotX1, plotY1, PW, PH)
|
||||
|
||||
// X axis ticks and labels
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#333" text-anchor="middle">` + "\n")
|
||||
for _, tv := range xTicks {
|
||||
x := xv(tv)
|
||||
if x < float64(plotX1) || x > float64(plotX2) {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, plotY2+18, gpuFormatTick(tv))
|
||||
fmt.Fprintf(&b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
x, plotY2, x, plotY2+4)
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="13"`+
|
||||
` fill="#333" text-anchor="middle">Time (seconds)</text>`+"\n",
|
||||
plotX1+PW/2, plotY2+38)
|
||||
|
||||
// Y axes: [tempAxisX, plotX1, plotX2, clockAxisX]
|
||||
axisLineX := [4]int{tempAxisX, plotX1, plotX2, clockAxisX}
|
||||
axisRight := [4]bool{false, false, true, true}
|
||||
// Label x positions (for rotated vertical text)
|
||||
axisLabelX := [4]int{10, 68, 868, 950}
|
||||
|
||||
for i := 0; i < 4; i++ {
|
||||
ax := axisLineX[i]
|
||||
right := axisRight[i]
|
||||
color := colors[i]
|
||||
|
||||
// Axis line
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, plotY1, ax, plotY2, color)
|
||||
|
||||
// Ticks and tick labels
|
||||
fmt.Fprintf(&b, `<g font-family="sans-serif" font-size="10" fill="%s">`+"\n", color)
|
||||
for _, tick := range axes[i].ticks {
|
||||
y := yv(tick, i)
|
||||
if y < float64(plotY1) || y > float64(plotY2) {
|
||||
continue
|
||||
}
|
||||
dx := -5
|
||||
textX := ax - 8
|
||||
anchor := "end"
|
||||
if right {
|
||||
dx = 5
|
||||
textX = ax + 8
|
||||
anchor = "start"
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"`+
|
||||
` stroke="%s" stroke-width="1"/>`+"\n",
|
||||
ax, y, ax+dx, y, color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="%s" dy="4">%s</text>`+"\n",
|
||||
textX, y, anchor, gpuFormatTick(tick))
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Axis label (rotated)
|
||||
lx := axisLabelX[i]
|
||||
fmt.Fprintf(&b, `<text transform="translate(%d,%d) rotate(-90)"`+
|
||||
` font-family="sans-serif" font-size="12" fill="%s" text-anchor="middle">%s</text>`+"\n",
|
||||
lx, plotY1+PH/2, color, axisLabel[i])
|
||||
}
|
||||
|
||||
// Data lines
|
||||
for i := 0; i < 4; i++ {
|
||||
var pts strings.Builder
|
||||
for j := range rows {
|
||||
x := xv(t[j])
|
||||
y := yv(vals[i][j], i)
|
||||
if j == 0 {
|
||||
fmt.Fprintf(&pts, "%.1f,%.1f", x, y)
|
||||
} else {
|
||||
fmt.Fprintf(&pts, " %.1f,%.1f", x, y)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="1.5"/>`+"\n",
|
||||
pts.String(), colors[i])
|
||||
}
|
||||
|
||||
// Legend
|
||||
const legendY = 42
|
||||
for i := 0; i < 4; i++ {
|
||||
lx := plotX1 + i*(PW/4) + 10
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d"`+
|
||||
` stroke="%s" stroke-width="2"/>`+"\n",
|
||||
lx, legendY, lx+20, legendY, colors[i])
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" font-family="sans-serif" font-size="12" fill="#333">%s</text>`+"\n",
|
||||
lx+25, legendY+4, seriesLabel[i])
|
||||
}
|
||||
|
||||
b.WriteString("</svg>\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
const (
|
||||
termChartWidth = 70
|
||||
termChartHeight = 12
|
||||
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Used in SAT stress-test logs.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
caption string
|
||||
color string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||
for _, d := range defs {
|
||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||
termChartHeight, termChartWidth))
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
if len(vals) == 0 {
|
||||
return caption + "\n"
|
||||
}
|
||||
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
|
||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
||||
w := width
|
||||
if len(vals) < w {
|
||||
w = len(vals)
|
||||
}
|
||||
data := gpuDownsample(vals, w)
|
||||
|
||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
||||
row := make([]int, w)
|
||||
for i, v := range data {
|
||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
||||
if r < 0 {
|
||||
r = 0
|
||||
}
|
||||
if r > height {
|
||||
r = height
|
||||
}
|
||||
row[i] = r
|
||||
}
|
||||
|
||||
// Fill the character grid.
|
||||
grid := make([][]rune, height+1)
|
||||
for i := range grid {
|
||||
grid[i] = make([]rune, w)
|
||||
for j := range grid[i] {
|
||||
grid[i][j] = ' '
|
||||
}
|
||||
}
|
||||
for x := 0; x < w; x++ {
|
||||
r := row[x]
|
||||
if x == 0 {
|
||||
grid[r][0] = '─'
|
||||
continue
|
||||
}
|
||||
p := row[x-1]
|
||||
switch {
|
||||
case r == p:
|
||||
grid[r][x] = '─'
|
||||
case r < p: // value went up (row index decreased toward top)
|
||||
grid[r][x] = '╭'
|
||||
grid[p][x] = '╯'
|
||||
for y := r + 1; y < p; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
default: // r > p, value went down
|
||||
grid[p][x] = '╮'
|
||||
grid[r][x] = '╰'
|
||||
for y := p + 1; y < r; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Y axis tick labels.
|
||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
||||
tickAtRow := make(map[int]string)
|
||||
labelWidth := 4
|
||||
for _, t := range ticks {
|
||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
||||
if r < 0 || r > height {
|
||||
continue
|
||||
}
|
||||
s := gpuFormatTick(t)
|
||||
tickAtRow[r] = s
|
||||
if len(s) > labelWidth {
|
||||
labelWidth = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for r := 0; r <= height; r++ {
|
||||
label := tickAtRow[r]
|
||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
||||
switch {
|
||||
case label != "":
|
||||
b.WriteRune('┤')
|
||||
case r == height:
|
||||
b.WriteRune('┼')
|
||||
default:
|
||||
b.WriteRune('│')
|
||||
}
|
||||
b.WriteString(color)
|
||||
b.WriteString(string(grid[r]))
|
||||
b.WriteString(ansiReset)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
// Bottom axis.
|
||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
||||
b.WriteRune('└')
|
||||
b.WriteString(strings.Repeat("─", w))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Caption centered under the chart.
|
||||
if caption != "" {
|
||||
total := labelWidth + 1 + w
|
||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
||||
b.WriteString(strings.Repeat(" ", pad))
|
||||
}
|
||||
b.WriteString(caption)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||
v := make([]float64, len(rows))
|
||||
for i, r := range rows {
|
||||
v[i] = fn(r)
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
||||
func gpuDownsample(vals []float64, w int) []float64 {
|
||||
n := len(vals)
|
||||
if n == 0 {
|
||||
return make([]float64, w)
|
||||
}
|
||||
result := make([]float64, w)
|
||||
if n >= w {
|
||||
counts := make([]int, w)
|
||||
for i, v := range vals {
|
||||
bucket := i * w / n
|
||||
if bucket >= w {
|
||||
bucket = w - 1
|
||||
}
|
||||
result[bucket] += v
|
||||
counts[bucket]++
|
||||
}
|
||||
for i := range result {
|
||||
if counts[i] > 0 {
|
||||
result[i] /= float64(counts[i])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Nearest-neighbour upsample.
|
||||
for i := range result {
|
||||
src := i * (n - 1) / (w - 1)
|
||||
if src >= n {
|
||||
src = n - 1
|
||||
}
|
||||
result[i] = vals[src]
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
mn, mx := vals[0], vals[0]
|
||||
for _, v := range vals[1:] {
|
||||
if v < mn {
|
||||
mn = v
|
||||
}
|
||||
if v > mx {
|
||||
mx = v
|
||||
}
|
||||
}
|
||||
return mn, mx
|
||||
}
|
||||
|
||||
func gpuNiceTicks(mn, mx float64, targetCount int) []float64 {
|
||||
if mn == mx {
|
||||
mn -= 1
|
||||
mx += 1
|
||||
}
|
||||
r := mx - mn
|
||||
step := math.Pow(10, math.Floor(math.Log10(r/float64(targetCount))))
|
||||
for _, f := range []float64{1, 2, 5, 10} {
|
||||
if r/(f*step) <= float64(targetCount)*1.5 {
|
||||
step = f * step
|
||||
break
|
||||
}
|
||||
}
|
||||
lo := math.Floor(mn/step) * step
|
||||
hi := math.Ceil(mx/step) * step
|
||||
var ticks []float64
|
||||
for v := lo; v <= hi+step*0.001; v += step {
|
||||
ticks = append(ticks, math.Round(v*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func gpuFormatTick(v float64) string {
|
||||
if v == math.Trunc(v) {
|
||||
return strconv.Itoa(int(v))
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
214
audit/internal/platform/install.go
Normal file
214
audit/internal/platform/install.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
func (s *System) ListInstallDisks() ([]InstallDisk, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,MODEL,SIZE,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lsblk: %w", err)
|
||||
}
|
||||
|
||||
bootDev := findLiveBootDevice()
|
||||
|
||||
var disks []InstallDisk
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
// NAME MODEL SIZE TYPE TRAN — model may have spaces so we parse from end
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
// Last field: TRAN, second-to-last: TYPE, third-to-last: SIZE
|
||||
typ := fields[len(fields)-2]
|
||||
size := fields[len(fields)-3]
|
||||
name := fields[0]
|
||||
model := strings.Join(fields[1:len(fields)-3], " ")
|
||||
|
||||
if typ != "disk" {
|
||||
continue
|
||||
}
|
||||
|
||||
device := "/dev/" + name
|
||||
if device == bootDev {
|
||||
continue
|
||||
}
|
||||
|
||||
sizeBytes := diskSizeBytes(device)
|
||||
mounted := mountedParts(device)
|
||||
|
||||
disks = append(disks, InstallDisk{
|
||||
Device: device,
|
||||
Model: strings.TrimSpace(model),
|
||||
Size: size,
|
||||
SizeBytes: sizeBytes,
|
||||
MountedParts: mounted,
|
||||
})
|
||||
}
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
// diskSizeBytes returns the byte size of a block device using lsblk.
|
||||
func diskSizeBytes(device string) int64 {
|
||||
out, err := exec.Command("lsblk", "-bdn", "-o", "SIZE", device).Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
n, _ := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64)
|
||||
return n
|
||||
}
|
||||
|
||||
// mountedParts returns a list of "<part> at <mountpoint>" strings for any
|
||||
// mounted partitions on the given device.
|
||||
func mountedParts(device string) []string {
|
||||
out, err := exec.Command("lsblk", "-n", "-o", "NAME,MOUNTPOINT", device).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var result []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
mp := fields[1]
|
||||
if mp == "" || mp == "[SWAP]" {
|
||||
continue
|
||||
}
|
||||
result = append(result, "/dev/"+strings.TrimLeft(fields[0], "└─├─")+" at "+mp)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// findLiveBootDevice returns the block device backing /run/live/medium (if any).
|
||||
func findLiveBootDevice() string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
src := strings.TrimSpace(string(out))
|
||||
if src == "" {
|
||||
return ""
|
||||
}
|
||||
// Strip partition suffix to get the whole disk device.
|
||||
// e.g. /dev/sdb1 → /dev/sdb, /dev/nvme0n1p1 → /dev/nvme0n1
|
||||
out2, err := exec.Command("lsblk", "-no", "PKNAME", src).Output()
|
||||
if err != nil || strings.TrimSpace(string(out2)) == "" {
|
||||
return src
|
||||
}
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
func toramActive() bool {
|
||||
data, err := os.ReadFile("/proc/cmdline")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.Contains(string(data), "toram")
|
||||
}
|
||||
|
||||
// freeMemBytes returns MemAvailable from /proc/meminfo.
|
||||
func freeMemBytes() int64 {
|
||||
data, err := os.ReadFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
if strings.HasPrefix(line, "MemAvailable:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
n, _ := strconv.ParseInt(fields[1], 10, 64)
|
||||
return n * 1024 // kB → bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// DiskWarnings returns advisory warning strings for a disk candidate.
|
||||
func DiskWarnings(d InstallDisk) []string {
|
||||
var w []string
|
||||
if len(d.MountedParts) > 0 {
|
||||
w = append(w, "has mounted partitions: "+strings.Join(d.MountedParts, ", "))
|
||||
}
|
||||
min := MinInstallBytes()
|
||||
if min > 0 && d.SizeBytes > 0 && d.SizeBytes < min {
|
||||
w = append(w, fmt.Sprintf("disk may be too small (need ≥ %s, have %s)",
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
}
|
||||
return w
|
||||
}
|
||||
|
||||
func humanBytes(b int64) string {
|
||||
const unit = 1024
|
||||
if b < unit {
|
||||
return fmt.Sprintf("%d B", b)
|
||||
}
|
||||
div, exp := int64(unit), 0
|
||||
for n := b / unit; n >= unit; n /= unit {
|
||||
div *= unit
|
||||
exp++
|
||||
}
|
||||
return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
|
||||
}
|
||||
|
||||
// InstallToDisk runs bee-install <device> <logfile> and streams output to logFile.
|
||||
// The context can be used to cancel.
|
||||
func (s *System) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
cmd := exec.CommandContext(ctx, "bee-install", device, logFile)
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// InstallLogPath returns the default install log path for a given device.
|
||||
func InstallLogPath(device string) string {
|
||||
safe := strings.NewReplacer("/", "_", " ", "_").Replace(device)
|
||||
return "/tmp/bee-install" + safe + ".log"
|
||||
}
|
||||
|
||||
// Label returns a display label for a disk.
|
||||
func (d InstallDisk) Label() string {
|
||||
model := d.Model
|
||||
if model == "" {
|
||||
model = "Unknown"
|
||||
}
|
||||
return fmt.Sprintf("%s %s %s", d.Device, d.Size, model)
|
||||
}
|
||||
191
audit/internal/platform/install_to_ram.go
Normal file
191
audit/internal/platform/install_to_ram.go
Normal file
@@ -0,0 +1,191 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
log := func(msg string) {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
}
|
||||
|
||||
if s.IsLiveMediaInRAM() {
|
||||
log("Already running from RAM — installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
if err != nil || len(squashfsFiles) == 0 {
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
||||
}
|
||||
|
||||
free := freeMemBytes()
|
||||
var needed int64
|
||||
for _, sf := range squashfsFiles {
|
||||
fi, err2 := os.Stat(sf)
|
||||
if err2 != nil {
|
||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||
}
|
||||
needed += fi.Size()
|
||||
}
|
||||
const headroom = 256 * 1024 * 1024
|
||||
if free > 0 && needed+headroom > free {
|
||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||
humanBytes(needed+headroom), humanBytes(free))
|
||||
}
|
||||
|
||||
dstDir := "/dev/shm/bee-live"
|
||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||
}
|
||||
|
||||
for _, sf := range squashfsFiles {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(sf)
|
||||
dst := filepath.Join(dstDir, base)
|
||||
log(fmt.Sprintf("Copying %s to RAM...", base))
|
||||
if err := copyFileLarge(ctx, sf, dst, log); err != nil {
|
||||
return fmt.Errorf("copy %s: %v", base, err)
|
||||
}
|
||||
log(fmt.Sprintf("Copied %s.", base))
|
||||
|
||||
loopDev, err := findLoopForFile(sf)
|
||||
if err != nil {
|
||||
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, err))
|
||||
continue
|
||||
}
|
||||
if err := reassociateLoopDevice(loopDev, dst); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, err))
|
||||
} else {
|
||||
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||
}
|
||||
}
|
||||
|
||||
log("Copying remaining medium files...")
|
||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||
}
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
}
|
||||
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
fi, err := in.Stat()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
total := fi.Size()
|
||||
var copied int64
|
||||
buf := make([]byte, 4*1024*1024)
|
||||
for {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
n, err := in.Read(buf)
|
||||
if n > 0 {
|
||||
if _, werr := out.Write(buf[:n]); werr != nil {
|
||||
return werr
|
||||
}
|
||||
copied += int64(n)
|
||||
if logFunc != nil && total > 0 {
|
||||
pct := int(float64(copied) / float64(total) * 100)
|
||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return out.Sync()
|
||||
}
|
||||
|
||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(src, path)
|
||||
target := filepath.Join(dst, rel)
|
||||
if fi.IsDir() {
|
||||
return os.MkdirAll(target, fi.Mode())
|
||||
}
|
||||
if strings.HasSuffix(path, ".squashfs") {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(target); err == nil {
|
||||
return nil
|
||||
}
|
||||
return copyFileLarge(ctx, path, target, nil)
|
||||
})
|
||||
}
|
||||
|
||||
func findLoopForFile(backingFile string) (string, error) {
|
||||
out, err := exec.Command("losetup", "--list", "--json").Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Name string `json:"name"`
|
||||
BackFile string `json:"back-file"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for _, dev := range result.Loopdevices {
|
||||
if dev.BackFile == backingFile {
|
||||
return dev.Name, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
return loopChangeFD(loopDev, newFile)
|
||||
}
|
||||
28
audit/internal/platform/install_to_ram_linux.go
Normal file
28
audit/internal/platform/install_to_ram_linux.go
Normal file
@@ -0,0 +1,28 @@
|
||||
//go:build linux
|
||||
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
const ioctlLoopChangeFD = 0x4C08
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
lf, err := os.OpenFile(loopDev, os.O_RDWR, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer lf.Close()
|
||||
nf, err := os.OpenFile(newFile, os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer nf.Close()
|
||||
_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, lf.Fd(), ioctlLoopChangeFD, nf.Fd())
|
||||
if errno != 0 {
|
||||
return errno
|
||||
}
|
||||
return nil
|
||||
}
|
||||
9
audit/internal/platform/install_to_ram_other.go
Normal file
9
audit/internal/platform/install_to_ram_other.go
Normal file
@@ -0,0 +1,9 @@
|
||||
//go:build !linux
|
||||
|
||||
package platform
|
||||
|
||||
import "errors"
|
||||
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
326
audit/internal/platform/live_metrics.go
Normal file
326
audit/internal/platform/live_metrics.go
Normal file
@@ -0,0 +1,326 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
type TempReading struct {
|
||||
Name string `json:"name"`
|
||||
Group string `json:"group,omitempty"`
|
||||
Celsius float64 `json:"celsius"`
|
||||
}
|
||||
|
||||
// SampleLiveMetrics collects a single metrics snapshot from all available
|
||||
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
|
||||
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
|
||||
func SampleLiveMetrics() LiveMetricSample {
|
||||
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
||||
|
||||
// GPU metrics — try NVIDIA first, fall back to AMD
|
||||
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
||||
s.GPUs = gpus
|
||||
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
||||
s.GPUs = amdGPUs
|
||||
}
|
||||
|
||||
// Fan speeds — skipped silently if ipmitool unavailable
|
||||
fans, _ := sampleFanSpeeds()
|
||||
s.Fans = fans
|
||||
|
||||
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
||||
if !hasTempGroup(s.Temps, "cpu") {
|
||||
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
||||
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
||||
}
|
||||
}
|
||||
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
|
||||
// Memory load — from /proc/meminfo
|
||||
s.MemLoadPct = sampleMemLoadPct()
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
var cpuStatPrev [2]uint64 // [total, idle]
|
||||
|
||||
func sampleCPULoadPct() float64 {
|
||||
total, idle := readCPUStat()
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
||||
cpuStatPrev = [2]uint64{total, idle}
|
||||
if prevTotal == 0 {
|
||||
return 0
|
||||
}
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
return 0
|
||||
}
|
||||
pct := (1 - di/dt) * 100
|
||||
if pct < 0 {
|
||||
return 0
|
||||
}
|
||||
if pct > 100 {
|
||||
return 100
|
||||
}
|
||||
return pct
|
||||
}
|
||||
|
||||
func readCPUStat() (total, idle uint64) {
|
||||
f, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
defer f.Close()
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if !strings.HasPrefix(line, "cpu ") {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)[1:] // skip "cpu"
|
||||
var vals [10]uint64
|
||||
for i := 0; i < len(fields) && i < 10; i++ {
|
||||
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
|
||||
}
|
||||
// idle = idle + iowait
|
||||
idle = vals[3] + vals[4]
|
||||
for _, v := range vals {
|
||||
total += v
|
||||
}
|
||||
return total, idle
|
||||
}
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
func sampleMemLoadPct() float64 {
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
defer f.Close()
|
||||
vals := map[string]uint64{}
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
fields := strings.Fields(sc.Text())
|
||||
if len(fields) >= 2 {
|
||||
v, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
vals[strings.TrimSuffix(fields[0], ":")] = v
|
||||
}
|
||||
}
|
||||
total := vals["MemTotal"]
|
||||
avail := vals["MemAvailable"]
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
used := total - avail
|
||||
return float64(used) / float64(total) * 100
|
||||
}
|
||||
|
||||
func hasTempGroup(temps []TempReading, group string) bool {
|
||||
for _, t := range temps {
|
||||
if t.Group == group {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func sampleLiveTemperatureReadings() []TempReading {
|
||||
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
||||
return temps
|
||||
}
|
||||
return sampleLiveTempsViaIPMI()
|
||||
}
|
||||
|
||||
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(out, &doc); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
chips := make([]string, 0, len(doc))
|
||||
for chip := range doc {
|
||||
chips = append(chips, chip)
|
||||
}
|
||||
sort.Strings(chips)
|
||||
|
||||
temps := make([]TempReading, 0, len(chips))
|
||||
seen := map[string]struct{}{}
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
featureNames := make([]string, 0, len(features))
|
||||
for name := range features {
|
||||
featureNames = append(featureNames, name)
|
||||
}
|
||||
sort.Strings(featureNames)
|
||||
for _, name := range featureNames {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
feature, ok := features[name].(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
value, ok := firstTempInputValue(feature)
|
||||
if !ok || value <= 0 || value > 150 {
|
||||
continue
|
||||
}
|
||||
group := classifyLiveTempGroup(chip, name)
|
||||
if group == "gpu" {
|
||||
continue
|
||||
}
|
||||
label := strings.TrimSpace(name)
|
||||
if label == "" {
|
||||
continue
|
||||
}
|
||||
if group == "ambient" {
|
||||
label = compactAmbientTempName(chip, label)
|
||||
}
|
||||
key := group + "\x00" + label
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||
}
|
||||
}
|
||||
return temps
|
||||
}
|
||||
|
||||
func sampleLiveTempsViaIPMI() []TempReading {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
var temps []TempReading
|
||||
seen := map[string]struct{}{}
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
||||
if !strings.Contains(unit, "degrees") {
|
||||
continue
|
||||
}
|
||||
raw := strings.TrimSpace(parts[1])
|
||||
if raw == "" || strings.EqualFold(raw, "na") {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.ParseFloat(raw, 64)
|
||||
if err != nil || value <= 0 || value > 150 {
|
||||
continue
|
||||
}
|
||||
group := classifyLiveTempGroup("", name)
|
||||
if group == "gpu" {
|
||||
continue
|
||||
}
|
||||
label := name
|
||||
if group == "ambient" {
|
||||
label = compactAmbientTempName("", label)
|
||||
}
|
||||
key := group + "\x00" + label
|
||||
if _, ok := seen[key]; ok {
|
||||
continue
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
||||
}
|
||||
return temps
|
||||
}
|
||||
|
||||
func firstTempInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
|
||||
continue
|
||||
}
|
||||
switch value := feature[key].(type) {
|
||||
case float64:
|
||||
return value, true
|
||||
case string:
|
||||
f, err := strconv.ParseFloat(value, 64)
|
||||
if err == nil {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func classifyLiveTempGroup(chip, name string) string {
|
||||
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
|
||||
switch {
|
||||
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
|
||||
return "gpu"
|
||||
case strings.Contains(text, "coretemp"),
|
||||
strings.Contains(text, "k10temp"),
|
||||
strings.Contains(text, "zenpower"),
|
||||
strings.Contains(text, "package id"),
|
||||
strings.Contains(text, "x86_pkg_temp"),
|
||||
strings.Contains(text, "tctl"),
|
||||
strings.Contains(text, "tdie"),
|
||||
strings.Contains(text, "tccd"),
|
||||
strings.Contains(text, "cpu"),
|
||||
strings.Contains(text, "peci"):
|
||||
return "cpu"
|
||||
default:
|
||||
return "ambient"
|
||||
}
|
||||
}
|
||||
|
||||
func compactAmbientTempName(chip, name string) string {
|
||||
chip = strings.TrimSpace(chip)
|
||||
name = strings.TrimSpace(name)
|
||||
if chip == "" || strings.EqualFold(chip, name) {
|
||||
return name
|
||||
}
|
||||
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
|
||||
return name
|
||||
}
|
||||
return chip + " / " + name
|
||||
}
|
||||
44
audit/internal/platform/live_metrics_test.go
Normal file
44
audit/internal/platform/live_metrics_test.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestFirstTempInputValue(t *testing.T) {
|
||||
feature := map[string]any{
|
||||
"temp1_input": 61.5,
|
||||
"temp1_max": 80.0,
|
||||
}
|
||||
got, ok := firstTempInputValue(feature)
|
||||
if !ok {
|
||||
t.Fatal("expected value")
|
||||
}
|
||||
if got != 61.5 {
|
||||
t.Fatalf("got %v want 61.5", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifyLiveTempGroup(t *testing.T) {
|
||||
tests := []struct {
|
||||
chip string
|
||||
name string
|
||||
want string
|
||||
}{
|
||||
{chip: "coretemp-isa-0000", name: "Package id 0", want: "cpu"},
|
||||
{chip: "amdgpu-pci-4300", name: "edge", want: "gpu"},
|
||||
{chip: "nvme-pci-0100", name: "Composite", want: "ambient"},
|
||||
{chip: "acpitz-acpi-0", name: "temp1", want: "ambient"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := classifyLiveTempGroup(tc.chip, tc.name); got != tc.want {
|
||||
t.Fatalf("classifyLiveTempGroup(%q,%q)=%q want %q", tc.chip, tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCompactAmbientTempName(t *testing.T) {
|
||||
if got := compactAmbientTempName("nvme-pci-0100", "Composite"); got != "nvme-pci-0100 / Composite" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
if got := compactAmbientTempName("", "Inlet Temp"); got != "Inlet Temp" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -18,21 +19,17 @@ func (s *System) ListInterfaces() ([]InterfaceInfo, error) {
|
||||
out := make([]InterfaceInfo, 0, len(names))
|
||||
for _, name := range names {
|
||||
state := "unknown"
|
||||
if raw, err := exec.Command("ip", "-o", "link", "show", name).Output(); err == nil {
|
||||
fields := strings.Fields(string(raw))
|
||||
if len(fields) >= 9 {
|
||||
state = fields[8]
|
||||
if up, err := interfaceAdminState(name); err == nil {
|
||||
if up {
|
||||
state = "up"
|
||||
} else {
|
||||
state = "down"
|
||||
}
|
||||
}
|
||||
|
||||
var ipv4 []string
|
||||
if raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", name).Output(); err == nil {
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 4 {
|
||||
ipv4 = append(ipv4, fields[3])
|
||||
}
|
||||
}
|
||||
ipv4, err := interfaceIPv4Addrs(name)
|
||||
if err != nil {
|
||||
ipv4 = nil
|
||||
}
|
||||
|
||||
out = append(out, InterfaceInfo{Name: name, State: state, IPv4: ipv4})
|
||||
@@ -55,6 +52,119 @@ func (s *System) DefaultRoute() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) CaptureNetworkSnapshot() (NetworkSnapshot, error) {
|
||||
names, err := listInterfaceNames()
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
|
||||
snapshot := NetworkSnapshot{
|
||||
Interfaces: make([]NetworkInterfaceSnapshot, 0, len(names)),
|
||||
}
|
||||
for _, name := range names {
|
||||
up, err := interfaceAdminState(name)
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
ipv4, err := interfaceIPv4Addrs(name)
|
||||
if err != nil {
|
||||
return NetworkSnapshot{}, err
|
||||
}
|
||||
snapshot.Interfaces = append(snapshot.Interfaces, NetworkInterfaceSnapshot{
|
||||
Name: name,
|
||||
Up: up,
|
||||
IPv4: ipv4,
|
||||
})
|
||||
}
|
||||
|
||||
if raw, err := exec.Command("ip", "route", "show", "default").Output(); err == nil {
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
snapshot.DefaultRoutes = append(snapshot.DefaultRoutes, line)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if raw, err := os.ReadFile("/etc/resolv.conf"); err == nil {
|
||||
snapshot.ResolvConf = string(raw)
|
||||
}
|
||||
|
||||
return snapshot, nil
|
||||
}
|
||||
|
||||
func (s *System) RestoreNetworkSnapshot(snapshot NetworkSnapshot) error {
|
||||
var errs []string
|
||||
|
||||
for _, iface := range snapshot.Interfaces {
|
||||
if err := exec.Command("ip", "link", "set", "dev", iface.Name, "up").Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: bring up before restore: %v", iface.Name, err))
|
||||
continue
|
||||
}
|
||||
if err := exec.Command("ip", "addr", "flush", "dev", iface.Name).Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: flush addresses: %v", iface.Name, err))
|
||||
}
|
||||
for _, cidr := range iface.IPv4 {
|
||||
if raw, err := exec.Command("ip", "addr", "add", cidr, "dev", iface.Name).CombinedOutput(); err != nil {
|
||||
detail := strings.TrimSpace(string(raw))
|
||||
if detail != "" {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v: %s", iface.Name, cidr, err, detail))
|
||||
} else {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore address %s: %v", iface.Name, cidr, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
state := "down"
|
||||
if iface.Up {
|
||||
state = "up"
|
||||
}
|
||||
if err := exec.Command("ip", "link", "set", "dev", iface.Name, state).Run(); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("%s: restore state %s: %v", iface.Name, state, err))
|
||||
}
|
||||
}
|
||||
|
||||
if err := exec.Command("ip", "route", "del", "default").Run(); err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
if !errors.As(err, &exitErr) {
|
||||
errs = append(errs, fmt.Sprintf("clear default route: %v", err))
|
||||
}
|
||||
}
|
||||
for _, route := range snapshot.DefaultRoutes {
|
||||
fields := strings.Fields(route)
|
||||
if len(fields) == 0 {
|
||||
continue
|
||||
}
|
||||
// Strip state flags that ip-route(8) does not accept as add arguments.
|
||||
filtered := fields[:0]
|
||||
for _, f := range fields {
|
||||
switch f {
|
||||
case "linkdown", "dead", "onlink", "pervasive":
|
||||
// skip
|
||||
default:
|
||||
filtered = append(filtered, f)
|
||||
}
|
||||
}
|
||||
args := append([]string{"route", "add"}, filtered...)
|
||||
if raw, err := exec.Command("ip", args...).CombinedOutput(); err != nil {
|
||||
detail := strings.TrimSpace(string(raw))
|
||||
if detail != "" {
|
||||
errs = append(errs, fmt.Sprintf("restore route %q: %v: %s", route, err, detail))
|
||||
} else {
|
||||
errs = append(errs, fmt.Sprintf("restore route %q: %v", route, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.WriteFile("/etc/resolv.conf", []byte(snapshot.ResolvConf), 0644); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("restore resolv.conf: %v", err))
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return errors.New(strings.Join(errs, "; "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *System) DHCPOne(iface string) (string, error) {
|
||||
var out bytes.Buffer
|
||||
if err := exec.Command("ip", "link", "set", iface, "up").Run(); err != nil {
|
||||
@@ -131,6 +241,65 @@ func (s *System) SetStaticIPv4(cfg StaticIPv4Config) (string, error) {
|
||||
return out.String(), nil
|
||||
}
|
||||
|
||||
// SetInterfaceState brings a network interface up or down.
|
||||
func (s *System) SetInterfaceState(iface string, up bool) error {
|
||||
state := "down"
|
||||
if up {
|
||||
state = "up"
|
||||
}
|
||||
return exec.Command("ip", "link", "set", "dev", iface, state).Run()
|
||||
}
|
||||
|
||||
// GetInterfaceState returns true if the interface is UP.
|
||||
func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
return interfaceAdminState(iface)
|
||||
}
|
||||
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
|
||||
func parseInterfaceAdminState(raw string) (bool, error) {
|
||||
start := strings.IndexByte(raw, '<')
|
||||
if start == -1 {
|
||||
return false, fmt.Errorf("ip link output missing flags")
|
||||
}
|
||||
end := strings.IndexByte(raw[start+1:], '>')
|
||||
if end == -1 {
|
||||
return false, fmt.Errorf("ip link output missing flag terminator")
|
||||
}
|
||||
flags := strings.Split(raw[start+1:start+1+end], ",")
|
||||
for _, flag := range flags {
|
||||
if strings.TrimSpace(flag) == "UP" {
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "-4", "addr", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 4 {
|
||||
ipv4 = append(ipv4, fields[3])
|
||||
}
|
||||
}
|
||||
return ipv4, nil
|
||||
}
|
||||
|
||||
func listInterfaceNames() ([]string, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show").Output()
|
||||
if err != nil {
|
||||
|
||||
46
audit/internal/platform/network_test.go
Normal file
46
audit/internal/platform/network_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseInterfaceAdminState(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
raw string
|
||||
want bool
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "admin up with no carrier",
|
||||
raw: "2: enp1s0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc mq state DOWN mode DEFAULT group default qlen 1000\n",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "admin down",
|
||||
raw: "2: enp1s0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000\n",
|
||||
want: false,
|
||||
},
|
||||
{
|
||||
name: "malformed output",
|
||||
raw: "2: enp1s0: mtu 1500 state DOWN\n",
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, err := parseInterfaceAdminState(tt.raw)
|
||||
if tt.wantErr {
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if got != tt.want {
|
||||
t.Fatalf("got %v want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
205
audit/internal/platform/nvidia_stress.go
Normal file
205
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,205 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
normalizeNvidiaStressOptions(&opts)
|
||||
|
||||
job, err := buildNvidiaStressJob(opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return satJob{}, err
|
||||
}
|
||||
|
||||
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||
switch loader {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-gpu-burn.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderJohn:
|
||||
cmd := []string{
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-john-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderNCCL:
|
||||
cmd := []string{
|
||||
"bee-nccl-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-nccl-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
default:
|
||||
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
case NvidiaStressLoaderJohn:
|
||||
opts.Loader = NvidiaStressLoaderJohn
|
||||
case NvidiaStressLoaderNCCL:
|
||||
opts.Loader = NvidiaStressLoaderNCCL
|
||||
default:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
}
|
||||
|
||||
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
|
||||
selected := all
|
||||
if len(include) > 0 {
|
||||
want := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
want[idx] = struct{}{}
|
||||
}
|
||||
selected = selected[:0]
|
||||
for _, idx := range all {
|
||||
if _, ok := want[idx]; ok {
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||
}
|
||||
out := append([]int(nil), selected...)
|
||||
sort.Ints(out)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
func dedupeSortedIndices(values []int) []int {
|
||||
if len(values) == 0 {
|
||||
return nil
|
||||
}
|
||||
seen := make(map[int]struct{}, len(values))
|
||||
out := make([]int, 0, len(values))
|
||||
for _, value := range values {
|
||||
if value < 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[value]; ok {
|
||||
continue
|
||||
}
|
||||
seen[value] = struct{}{}
|
||||
out = append(out, value)
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func joinIndexList(values []int) string {
|
||||
parts := make([]string, 0, len(values))
|
||||
for _, value := range values {
|
||||
parts = append(parts, strconv.Itoa(value))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
528
audit/internal/platform/platform_stress.go
Normal file
528
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,528 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlatformStressCycle defines one load+idle cycle.
|
||||
type PlatformStressCycle struct {
|
||||
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||
IdleSec int // seconds of idle monitoring after load cut
|
||||
}
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
type platformStressRow struct {
|
||||
ElapsedSec float64
|
||||
Cycle int
|
||||
Phase string // "load" | "idle"
|
||||
CPULoadPct float64
|
||||
MaxCPUTempC float64
|
||||
MaxGPUTempC float64
|
||||
SysPowerW float64
|
||||
FanMinRPM float64
|
||||
FanMaxRPM float64
|
||||
GPUThrottled bool
|
||||
}
|
||||
|
||||
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||
func (s *System) RunPlatformStress(
|
||||
ctx context.Context,
|
||||
baseDir string,
|
||||
opts PlatformStressOptions,
|
||||
logFunc func(string),
|
||||
) (string, error) {
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if len(opts.Cycles) == 0 {
|
||||
return "", fmt.Errorf("no cycles defined")
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
|
||||
stamp := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
|
||||
var analyses []cycleAnalysis
|
||||
|
||||
for i, cycle := range opts.Cycles {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cycleNum := i + 1
|
||||
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||
|
||||
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
for _, r := range loadRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, loadRows...)
|
||||
loadCancel()
|
||||
wg.Wait()
|
||||
|
||||
if len(loadRows) > 0 {
|
||||
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||
}
|
||||
|
||||
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||
for _, r := range idleRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, idleRows...)
|
||||
idleCancel()
|
||||
|
||||
// Per-cycle analysis
|
||||
an := analyzePlatformCycle(loadRows, idleRows)
|
||||
analyses = append(analyses, an)
|
||||
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||
}
|
||||
|
||||
// Write CSV
|
||||
csvData := writePlatformCSV(rows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||
|
||||
// Write summary
|
||||
summary := writePlatformSummary(opts, analyses)
|
||||
logFunc("--- Summary ---")
|
||||
for _, line := range strings.Split(summary, "\n") {
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||
var rows []platformStressRow
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return rows
|
||||
case <-ticker.C:
|
||||
sample := SampleLiveMetrics()
|
||||
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||
r := platformStressRow{
|
||||
ElapsedSec: time.Since(testStart).Seconds(),
|
||||
Cycle: cycle,
|
||||
Phase: phase,
|
||||
CPULoadPct: s.CPULoadPct,
|
||||
SysPowerW: s.PowerW,
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
if t.Celsius > r.MaxCPUTempC {
|
||||
r.MaxCPUTempC = t.Celsius
|
||||
}
|
||||
case "gpu":
|
||||
if t.Celsius > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = t.Celsius
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
if g.TempC > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = g.TempC
|
||||
}
|
||||
}
|
||||
if len(s.Fans) > 0 {
|
||||
r.FanMinRPM = s.Fans[0].RPM
|
||||
r.FanMaxRPM = s.Fans[0].RPM
|
||||
for _, f := range s.Fans[1:] {
|
||||
if f.RPM < r.FanMinRPM {
|
||||
r.FanMinRPM = f.RPM
|
||||
}
|
||||
if f.RPM > r.FanMaxRPM {
|
||||
r.FanMaxRPM = f.RPM
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func formatPlatformRow(r platformStressRow) string {
|
||||
throttle := ""
|
||||
if r.GPUThrottled {
|
||||
throttle = " THROTTLE"
|
||||
}
|
||||
fans := ""
|
||||
if r.FanMinRPM > 0 {
|
||||
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||
}
|
||||
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||
}
|
||||
|
||||
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||
var an cycleAnalysis
|
||||
for _, r := range loadRows {
|
||||
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||
an.maxCPUTemp = r.MaxCPUTempC
|
||||
}
|
||||
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||
an.maxGPUTemp = r.MaxGPUTempC
|
||||
}
|
||||
if r.SysPowerW > an.maxPower {
|
||||
an.maxPower = r.SysPowerW
|
||||
}
|
||||
if r.GPUThrottled {
|
||||
an.throttled = true
|
||||
}
|
||||
}
|
||||
// Fan RPM at cut = avg of last 5 load rows
|
||||
if n := len(loadRows); n > 0 {
|
||||
window := loadRows
|
||||
if n > 5 {
|
||||
window = loadRows[n-5:]
|
||||
}
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range window {
|
||||
if r.FanMinRPM > 0 {
|
||||
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt > 0 {
|
||||
an.fanAtCutAvg = sum / float64(cnt)
|
||||
}
|
||||
}
|
||||
// Fan RPM min in first 15s of idle
|
||||
an.fanMin15s = an.fanAtCutAvg
|
||||
var cutElapsed float64
|
||||
if len(loadRows) > 0 {
|
||||
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||
}
|
||||
for _, r := range idleRows {
|
||||
if r.ElapsedSec > cutElapsed+15 {
|
||||
break
|
||||
}
|
||||
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||
an.fanMin15s = avg
|
||||
}
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||
}
|
||||
return an
|
||||
}
|
||||
|
||||
type cycleAnalysis struct {
|
||||
maxCPUTemp float64
|
||||
maxGPUTemp float64
|
||||
maxPower float64
|
||||
throttled bool
|
||||
fanAtCutAvg float64
|
||||
fanMin15s float64
|
||||
fanDropPct float64
|
||||
}
|
||||
|
||||
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||
|
||||
totalThrottle := 0
|
||||
totalFanWarn := 0
|
||||
for i, an := range analyses {
|
||||
cycle := opts.Cycles[i]
|
||||
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||
if an.throttled {
|
||||
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||
totalThrottle++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Throttle: none\n")
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||
if an.fanDropPct > 20 {
|
||||
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||
totalFanWarn++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||
if totalThrottle > 0 {
|
||||
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||
} else if totalFanWarn > 0 {
|
||||
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||
} else {
|
||||
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||
var buf bytes.Buffer
|
||||
w := csv.NewWriter(&buf)
|
||||
_ = w.Write([]string{
|
||||
"elapsed_sec", "cycle", "phase",
|
||||
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||
})
|
||||
for _, r := range rows {
|
||||
throttled := "0"
|
||||
if r.GPUThrottled {
|
||||
throttled = "1"
|
||||
}
|
||||
_ = w.Write([]string{
|
||||
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||
strconv.Itoa(r.Cycle),
|
||||
r.Phase,
|
||||
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||
throttled,
|
||||
})
|
||||
}
|
||||
w.Flush()
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
path, err := satLookPath("stressapptest")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, nice)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
gz := gzip.NewWriter(f)
|
||||
defer gz.Close()
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(dir)
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
fpath := filepath.Join(dir, e.Name())
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
hdr := &tar.Header{
|
||||
Name: filepath.Join(base, e.Name()),
|
||||
Size: int64(len(data)),
|
||||
Mode: 0644,
|
||||
ModTime: time.Now(),
|
||||
}
|
||||
if err := tw.WriteHeader(hdr); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tw.Write(data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
217
audit/internal/platform/runtime.go
Normal file
217
audit/internal/platform/runtime.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
var runtimeRequiredTools = []string{
|
||||
"dmidecode",
|
||||
"lspci",
|
||||
"lsblk",
|
||||
"smartctl",
|
||||
"nvme",
|
||||
"ipmitool",
|
||||
"dhclient",
|
||||
"mount",
|
||||
}
|
||||
|
||||
var runtimeTrackedServices = []string{
|
||||
"bee-network",
|
||||
"bee-nvidia",
|
||||
"bee-preflight",
|
||||
"bee-audit",
|
||||
"bee-web",
|
||||
"bee-sshsetup",
|
||||
}
|
||||
|
||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||
checkedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
health := schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
CheckedAt: checkedAt,
|
||||
ExportDir: strings.TrimSpace(exportDir),
|
||||
}
|
||||
|
||||
if health.ExportDir != "" {
|
||||
if err := os.MkdirAll(health.ExportDir, 0755); err != nil {
|
||||
health.Status = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "export_dir_unavailable",
|
||||
Severity: "critical",
|
||||
Description: err.Error(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
interfaces, err := s.ListInterfaces()
|
||||
if err == nil {
|
||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||
hasIPv4 := false
|
||||
missingIPv4 := false
|
||||
for _, iface := range interfaces {
|
||||
outcome := "no_offer"
|
||||
if len(iface.IPv4) > 0 {
|
||||
outcome = "lease_acquired"
|
||||
hasIPv4 = true
|
||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||
outcome = "link_down"
|
||||
} else {
|
||||
missingIPv4 = true
|
||||
}
|
||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||
Name: iface.Name,
|
||||
State: iface.State,
|
||||
IPv4: iface.IPv4,
|
||||
Outcome: outcome,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case hasIPv4 && !missingIPv4:
|
||||
health.NetworkStatus = "OK"
|
||||
case hasIPv4:
|
||||
health.NetworkStatus = "PARTIAL"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_partial",
|
||||
Severity: "warning",
|
||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
||||
})
|
||||
default:
|
||||
health.NetworkStatus = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_failed",
|
||||
Severity: "warning",
|
||||
Description: "No physical interface obtained IPv4 connectivity.",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
for _, tool := range s.runtimeToolStatuses(vendor) {
|
||||
health.Tools = append(health.Tools, schema.RuntimeToolStatus{
|
||||
Name: tool.Name,
|
||||
Path: tool.Path,
|
||||
OK: tool.OK,
|
||||
})
|
||||
if !tool.OK {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "tool_missing",
|
||||
Severity: "warning",
|
||||
Description: "Required tool missing: " + tool.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for _, name := range runtimeTrackedServices {
|
||||
health.Services = append(health.Services, schema.RuntimeServiceStatus{
|
||||
Name: name,
|
||||
Status: s.ServiceState(name),
|
||||
})
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
}
|
||||
return health, nil
|
||||
}
|
||||
|
||||
func commandText(name string, args ...string) string {
|
||||
raw, err := exec.Command(name, args...).CombinedOutput()
|
||||
if err != nil && len(raw) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools := s.CheckTools(runtimeRequiredTools)
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
tool.Path = cmd[0]
|
||||
if len(cmd) > 1 && strings.HasSuffix(cmd[1], "rocm_smi.py") {
|
||||
tool.Path = cmd[1]
|
||||
}
|
||||
tool.OK = true
|
||||
}
|
||||
tools = append(tools, tool)
|
||||
}
|
||||
return tools
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA kernel module is not loaded.",
|
||||
})
|
||||
}
|
||||
if health.DriverReady && !strings.Contains(lsmodText, "nvidia_modeset") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_modeset_failed",
|
||||
Severity: "warning",
|
||||
Description: "nvidia-modeset is not loaded; display/CUDA stack may be partial.",
|
||||
})
|
||||
}
|
||||
if out, err := exec.Command("nvidia-smi", "-L").CombinedOutput(); err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "cuda_runtime_not_ready",
|
||||
Severity: "warning",
|
||||
Description: "CUDA runtime is not ready for GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
case "amd":
|
||||
health.DriverReady = strings.Contains(lsmodText, "amdgpu ") || strings.Contains(lsmodText, "amdkfd")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "amdgpu_kernel_module_missing",
|
||||
Severity: "warning",
|
||||
Description: "AMD GPU driver is not loaded.",
|
||||
})
|
||||
}
|
||||
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err == nil && strings.TrimSpace(string(out)) != "" {
|
||||
health.CUDAReady = true
|
||||
health.DriverReady = true
|
||||
return
|
||||
}
|
||||
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "rocm_smi_unavailable",
|
||||
Severity: "warning",
|
||||
Description: "ROCm SMI is not available for AMD GPU SAT.",
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,11 @@ package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bufio"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
@@ -11,24 +15,341 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string) (string, error) {
|
||||
return runAcceptancePack(baseDir, "gpu-nvidia", nvidiaSATJobs())
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
"/opt/rocm-*/bin/rocm-smi",
|
||||
}
|
||||
rocmSMIScriptGlobs = []string{
|
||||
"/opt/rocm/libexec/rocm_smi/rocm_smi.py",
|
||||
"/opt/rocm-*/libexec/rocm_smi/rocm_smi.py",
|
||||
}
|
||||
rvsExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rvs",
|
||||
"/opt/rocm-*/bin/rvs",
|
||||
}
|
||||
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||
// Returns combined stdout+stderr as a byte slice.
|
||||
func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
||||
pr, pw := io.Pipe()
|
||||
cmd.Stdout = pw
|
||||
cmd.Stderr = pw
|
||||
|
||||
var buf bytes.Buffer
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
scanner := bufio.NewScanner(pr)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
buf.WriteString(line + "\n")
|
||||
if logFunc != nil {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
err := cmd.Start()
|
||||
if err != nil {
|
||||
_ = pw.Close()
|
||||
wg.Wait()
|
||||
return nil, err
|
||||
}
|
||||
waitErr := cmd.Wait()
|
||||
_ = pw.Close()
|
||||
wg.Wait()
|
||||
return buf.Bytes(), waitErr
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(baseDir string) (string, error) {
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
Name string
|
||||
MemoryMB int
|
||||
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
type AMDGPUInfo struct {
|
||||
Index int
|
||||
Name string
|
||||
}
|
||||
|
||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||
func (s *System) DetectGPUVendor() string {
|
||||
if _, err := os.Stat("/dev/nvidia0"); err == nil {
|
||||
return "nvidia"
|
||||
}
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// ListAMDGPUs returns AMD GPUs visible to rocm-smi.
|
||||
func (s *System) ListAMDGPUs() ([]AMDGPUInfo, error) {
|
||||
out, err := runROCmSMI("--showproductname", "--csv")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("rocm-smi: %w", err)
|
||||
}
|
||||
var gpus []AMDGPUInfo
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, ",", 2)
|
||||
name := ""
|
||||
if len(parts) >= 2 {
|
||||
name = strings.TrimSpace(parts[1])
|
||||
}
|
||||
idx := len(gpus)
|
||||
gpus = append(gpus, AMDGPUInfo{Index: idx, Name: name})
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// RunAMDAcceptancePack runs an AMD GPU diagnostic pack using rocm-smi.
|
||||
func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-smi-showallinfo.log", cmd: []string{"rocm-smi", "--showallinfo"}},
|
||||
{name: "03-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "04-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||
cfg := `actions:
|
||||
- name: mem_integrity
|
||||
device: all
|
||||
module: mem
|
||||
parallel: true
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size: 8640
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||
cfg := `actions:
|
||||
- name: babel_mem_bw
|
||||
device: all
|
||||
module: babel
|
||||
parallel: true
|
||||
copy_matrix: true
|
||||
target_stress: 90
|
||||
matrix_size: 134217728
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_AMD_STRESS_SECONDS", 300)
|
||||
}
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||
rvsCfg := amdStressRVSConfig(seconds)
|
||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||
}
|
||||
|
||||
func amdStressRVSConfig(seconds int) string {
|
||||
return fmt.Sprintf(`actions:
|
||||
- name: gst_stress
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: %d
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`, seconds*1000)
|
||||
}
|
||||
|
||||
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||
return []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||
}
|
||||
}
|
||||
|
||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||
func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
out, err := exec.Command("nvidia-smi",
|
||||
"--query-gpu=index,name,memory.total",
|
||||
"--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var gpus []NvidiaGPU
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, ", ", 3)
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
memMB, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
||||
gpus = append(gpus, NvidiaGPU{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
MemoryMB: memMB,
|
||||
})
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
// detect GPU count
|
||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(context.Background(), baseDir, "gpu-nvidia", nvidiaSATJobs(), logFunc)
|
||||
}
|
||||
|
||||
// RunNvidiaAcceptancePackWithOptions runs the NVIDIA diagnostics via DCGM.
|
||||
// diagLevel: 1=quick, 2=medium, 3=targeted stress, 4=extended stress.
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
return runAcceptancePack(baseDir, "memory", []satJob{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
})
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
}
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
"stress-ng", "--vm", "1",
|
||||
"--vm-bytes", sizeArg,
|
||||
"--vm-method", "all",
|
||||
"--timeout", fmt.Sprintf("%d", seconds),
|
||||
"--metrics-brief",
|
||||
}},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
seconds := durationSec
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_SAT_STRESS_SECONDS", 300)
|
||||
}
|
||||
cmd := []string{"stressapptest", "-s", fmt.Sprintf("%d", seconds), "-W", "--cc_test"}
|
||||
if mb := envInt("BEE_SAT_STRESS_MB", 0); mb > 0 {
|
||||
cmd = append(cmd, "-M", fmt.Sprintf("%d", mb))
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "sat-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stressapptest.log", cmd: cmd},
|
||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if durationSec <= 0 {
|
||||
durationSec = 60
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "cpu", []satJob{
|
||||
{name: "01-lscpu.log", cmd: []string{"lscpu"}},
|
||||
{name: "02-sensors-before.log", cmd: []string{"sensors"}},
|
||||
{name: "03-stress-ng.log", cmd: []string{"stress-ng", "--cpu", "0", "--cpu-method", "all", "--timeout", fmt.Sprintf("%d", durationSec)}},
|
||||
{name: "04-sensors-after.log", cmd: []string{"sensors"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -37,6 +358,7 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
devices, err := listStorageDevices()
|
||||
if err != nil {
|
||||
@@ -55,11 +377,17 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
}
|
||||
|
||||
for index, devPath := range devices {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := exec.Command(job.cmd[0], job.cmd[1:]...).CombinedOutput()
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -83,8 +411,11 @@ func (s *System) RunStorageAcceptancePack(baseDir string) (string, error) {
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
name string
|
||||
cmd []string
|
||||
name string
|
||||
cmd []string
|
||||
env []string // extra env vars (appended to os.Environ)
|
||||
collectGPU bool // collect GPU metrics via nvidia-smi while this job runs
|
||||
gpuIndices []int // GPU indices to collect metrics for (empty = all)
|
||||
}
|
||||
|
||||
type satStats struct {
|
||||
@@ -94,18 +425,39 @@ type satStats struct {
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
if diagLevel < 1 || diagLevel > 4 {
|
||||
diagLevel = 3
|
||||
}
|
||||
diagArgs := []string{"dcgmi", "diag", "-r", strconv.Itoa(diagLevel)}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -114,16 +466,29 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
}
|
||||
out, err := exec.Command(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
|
||||
var out []byte
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -145,20 +510,51 @@ func runAcceptancePack(baseDir, prefix string, jobs []satJob) (string, error) {
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
out, err := streamExecOutput(c, logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "NAME,TYPE").Output()
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) != 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices, nil
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
@@ -166,7 +562,7 @@ func storageSATCommands(devPath string) []satJob {
|
||||
return []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "--start", "1"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||
}
|
||||
}
|
||||
return []satJob{
|
||||
@@ -213,19 +609,238 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
|
||||
text := strings.ToLower(string(out))
|
||||
// No output at all means the tool failed to start (mlock limit, binary missing,
|
||||
// etc.) — we cannot say anything about hardware health → UNSUPPORTED.
|
||||
if len(strings.TrimSpace(text)) == 0 {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
strings.Contains(text, "unknown command") ||
|
||||
strings.Contains(text, "not implemented") ||
|
||||
strings.Contains(text, "not available") ||
|
||||
strings.Contains(text, "cuda_error_system_not_ready") ||
|
||||
strings.Contains(text, "no such device") ||
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
return "FAILED", rc
|
||||
}
|
||||
|
||||
func runSATCommand(verboseLog, name string, cmd []string, logFunc func(string)) ([]byte, error) {
|
||||
start := time.Now().UTC()
|
||||
resolvedCmd, err := resolveSATCommand(cmd)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s", start.Format(time.RFC3339), name),
|
||||
"cmd: "+strings.Join(resolvedCmd, " "),
|
||||
)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("=== %s ===", name))
|
||||
}
|
||||
if err != nil {
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
"rc: 1",
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return []byte(err.Error() + "\n"), err
|
||||
}
|
||||
|
||||
out, err := streamExecOutput(satExecCommand(resolvedCmd[0], resolvedCmd[1:]...), logFunc)
|
||||
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), name),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
fmt.Sprintf("duration_ms: %d", time.Since(start).Milliseconds()),
|
||||
"",
|
||||
)
|
||||
return out, err
|
||||
}
|
||||
|
||||
func runROCmSMI(args ...string) ([]byte, error) {
|
||||
cmd, err := resolveROCmSMICommand(args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return satExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
||||
}
|
||||
|
||||
func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
if len(cmd) == 0 {
|
||||
return nil, errors.New("empty SAT command")
|
||||
}
|
||||
switch cmd[0] {
|
||||
case "rocm-smi":
|
||||
return resolveROCmSMICommand(cmd[1:]...)
|
||||
case "rvs":
|
||||
return resolveRVSCommand(cmd[1:]...)
|
||||
}
|
||||
path, err := satLookPath(cmd[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||
}
|
||||
return append([]string{path}, cmd[1:]...), nil
|
||||
}
|
||||
|
||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rvs"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
for _, path := range expandExistingPaths(rvsExecutableGlobs) {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
return nil, errors.New("rvs not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
if path, err := satLookPath("rocm-smi"); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
for _, path := range rocmSMIExecutableCandidates() {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
|
||||
pythonPath, pyErr := satLookPath("python3")
|
||||
if pyErr == nil {
|
||||
for _, script := range rocmSMIScriptCandidates() {
|
||||
cmd := []string{pythonPath, script}
|
||||
cmd = append(cmd, args...)
|
||||
return cmd, nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func ensureAMDRuntimeReady() error {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return nil
|
||||
}
|
||||
if raw, err := os.ReadFile("/sys/module/amdgpu/initstate"); err == nil {
|
||||
state := strings.TrimSpace(string(raw))
|
||||
if strings.EqualFold(state, "live") {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("AMD driver is present but not initialized: amdgpu initstate=%q", state)
|
||||
}
|
||||
return errors.New("AMD GPUs are present but the runtime is not initialized: /dev/kfd is missing and amdgpu is not loaded")
|
||||
}
|
||||
|
||||
func rocmSMIExecutableCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIExecutableGlobs)
|
||||
}
|
||||
|
||||
func rocmSMIScriptCandidates() []string {
|
||||
return expandExistingPaths(rocmSMIScriptGlobs)
|
||||
}
|
||||
|
||||
func expandExistingPaths(patterns []string) []string {
|
||||
seen := make(map[string]struct{})
|
||||
var paths []string
|
||||
for _, pattern := range patterns {
|
||||
matches, err := satGlob(pattern)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
for _, match := range matches {
|
||||
if _, err := satStat(match); err != nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[match]; ok {
|
||||
continue
|
||||
}
|
||||
seen[match] = struct{}{}
|
||||
paths = append(paths, match)
|
||||
}
|
||||
}
|
||||
return paths
|
||||
}
|
||||
|
||||
func parseStorageDevices(raw string) []string {
|
||||
var devices []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
fields := strings.Fields(strings.TrimSpace(line))
|
||||
if len(fields) < 2 || fields[1] != "disk" {
|
||||
continue
|
||||
}
|
||||
if len(fields) >= 3 && strings.EqualFold(fields[2], "usb") {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, "/dev/"+fields[0])
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// runSATCommandWithMetrics runs a command while collecting GPU metrics in the background.
|
||||
// On completion it writes gpu-metrics.csv and gpu-metrics.html into runDir.
|
||||
func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir string, logFunc func(string)) ([]byte, error) {
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
var metricRows []GPUMetricRow
|
||||
start := time.Now()
|
||||
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
samples, err := sampleGPUMetrics(gpuIndices)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
elapsed := time.Since(start).Seconds()
|
||||
for i := range samples {
|
||||
samples[i].ElapsedSec = elapsed
|
||||
}
|
||||
metricRows = append(metricRows, samples...)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||||
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
if len(metricRows) > 0 {
|
||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||
chart := RenderGPUTerminalChart(metricRows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
||||
}
|
||||
|
||||
return out, err
|
||||
}
|
||||
|
||||
func appendSATVerboseLog(path string, lines ...string) {
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
for _, line := range lines {
|
||||
_, _ = io.WriteString(f, line+"\n")
|
||||
}
|
||||
}
|
||||
|
||||
func envInt(name string, fallback int) int {
|
||||
raw := strings.TrimSpace(os.Getenv(name))
|
||||
if raw == "" {
|
||||
|
||||
691
audit/internal/platform/sat_fan_stress.go
Normal file
691
audit/internal/platform/sat_fan_stress.go
Normal file
@@ -0,0 +1,691 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FanStressOptions configures the fan-stress / thermal cycling test.
|
||||
type FanStressOptions struct {
|
||||
BaselineSec int // idle monitoring before and after load (default 30)
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
// FanReading holds one fan sensor reading.
|
||||
type FanReading struct {
|
||||
Name string
|
||||
RPM float64
|
||||
}
|
||||
|
||||
// GPUStressMetric holds per-GPU metrics during the stress test.
|
||||
type GPUStressMetric struct {
|
||||
Index int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
Throttled bool // true if any throttle reason is active
|
||||
}
|
||||
|
||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||
type FanStressRow struct {
|
||||
TimestampUTC string
|
||||
ElapsedSec float64
|
||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||
GPUs []GPUStressMetric
|
||||
Fans []FanReading
|
||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanStressOptions) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
applyFanStressDefaults(&opts)
|
||||
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "fan-stress-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
// Phase name shared between sampler goroutine and main goroutine.
|
||||
var phaseMu sync.Mutex
|
||||
currentPhase := "init"
|
||||
setPhase := func(name string) {
|
||||
phaseMu.Lock()
|
||||
currentPhase = name
|
||||
phaseMu.Unlock()
|
||||
}
|
||||
getPhase := func() string {
|
||||
phaseMu.Lock()
|
||||
defer phaseMu.Unlock()
|
||||
return currentPhase
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
var rowsMu sync.Mutex
|
||||
var allRows []FanStressRow
|
||||
|
||||
// Start background sampler (every second).
|
||||
stopCh := make(chan struct{})
|
||||
doneCh := make(chan struct{})
|
||||
go func() {
|
||||
defer close(doneCh)
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
row := sampleFanStressRow(opts.GPUIndices, getPhase(), time.Since(start).Seconds())
|
||||
rowsMu.Lock()
|
||||
allRows = append(allRows, row)
|
||||
rowsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var summary strings.Builder
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
|
||||
stats := satStats{}
|
||||
|
||||
// idlePhase sleeps for durSec while the sampler stamps phaseName on each row.
|
||||
idlePhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start %s (idle %ds)", time.Now().UTC().Format(time.RFC3339), stepName, durSec),
|
||||
)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-time.After(time.Duration(durSec) * time.Second):
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish %s", time.Now().UTC().Format(time.RFC3339), stepName),
|
||||
)
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
stats.Failed++
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "%s_status=OK\n", stepName)
|
||||
stats.OK++
|
||||
}
|
||||
}
|
||||
|
||||
// Execute test phases.
|
||||
idlePhase("baseline", "01-baseline", opts.BaselineSec)
|
||||
loadPhase("load1", "02-load1", opts.Phase1DurSec)
|
||||
idlePhase("pause", "03-pause", opts.PauseSec)
|
||||
loadPhase("load2", "04-load2", opts.Phase2DurSec)
|
||||
idlePhase("cooldown", "05-cooldown", opts.BaselineSec)
|
||||
|
||||
// Stop sampler and collect rows.
|
||||
close(stopCh)
|
||||
<-doneCh
|
||||
|
||||
rowsMu.Lock()
|
||||
rows := allRows
|
||||
rowsMu.Unlock()
|
||||
|
||||
// Analysis.
|
||||
throttled := analyzeThrottling(rows)
|
||||
maxGPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
var m float64
|
||||
for _, g := range r.GPUs {
|
||||
if g.TempC > m {
|
||||
m = g.TempC
|
||||
}
|
||||
}
|
||||
return m
|
||||
})
|
||||
maxCPUTemp := analyzeMaxTemp(rows, func(r FanStressRow) float64 {
|
||||
return r.CPUMaxTempC
|
||||
})
|
||||
fanResponseSec := analyzeFanResponse(rows)
|
||||
|
||||
fmt.Fprintf(&summary, "throttling_detected=%v\n", throttled)
|
||||
fmt.Fprintf(&summary, "max_gpu_temp_c=%.1f\n", maxGPUTemp)
|
||||
fmt.Fprintf(&summary, "max_cpu_temp_c=%.1f\n", maxCPUTemp)
|
||||
if fanResponseSec >= 0 {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=%.1f\n", fanResponseSec)
|
||||
} else {
|
||||
fmt.Fprintf(&summary, "fan_response_sec=N/A\n")
|
||||
}
|
||||
|
||||
// Throttling failure counts against overall result.
|
||||
if throttled {
|
||||
stats.Failed++
|
||||
}
|
||||
writeSATStats(&summary, stats)
|
||||
|
||||
// Write CSV outputs.
|
||||
if err := WriteFanStressCSV(filepath.Join(runDir, "metrics.csv"), rows, opts.GPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
_ = WriteFanSensorsCSV(filepath.Join(runDir, "fan-sensors.csv"), rows)
|
||||
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.BaselineSec <= 0 {
|
||||
opts.BaselineSec = 30
|
||||
}
|
||||
if opts.Phase1DurSec <= 0 {
|
||||
opts.Phase1DurSec = 300
|
||||
}
|
||||
if opts.PauseSec <= 0 {
|
||||
opts.PauseSec = 60
|
||||
}
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStressRow {
|
||||
row := FanStressRow{
|
||||
TimestampUTC: time.Now().UTC().Format(time.RFC3339),
|
||||
ElapsedSec: elapsed,
|
||||
Phase: phase,
|
||||
}
|
||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||
row.Fans, _ = sampleFanSpeeds()
|
||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||
row.SysPowerW = sampleSystemPower()
|
||||
return row
|
||||
}
|
||||
|
||||
// sampleGPUStressMetrics queries nvidia-smi for temperature, utilization, power,
|
||||
// clock frequency, and active throttle reasons for each GPU.
|
||||
func sampleGPUStressMetrics(gpuIndices []int) []GPUStressMetric {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics,clocks_throttle_reasons.active",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
ids := make([]string, len(gpuIndices))
|
||||
for i, idx := range gpuIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
args = append([]string{"--id=" + strings.Join(ids, ",")}, args...)
|
||||
}
|
||||
out, err := exec.Command("nvidia-smi", args...).Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var metrics []GPUStressMetric
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
throttleVal := strings.TrimSpace(parts[5])
|
||||
// Throttled if active reasons bitmask is non-zero.
|
||||
throttled := throttleVal != "0x0000000000000000" &&
|
||||
throttleVal != "0x0" &&
|
||||
throttleVal != "0" &&
|
||||
throttleVal != "" &&
|
||||
throttleVal != "N/A"
|
||||
metrics = append(metrics, GPUStressMetric{
|
||||
Index: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
Throttled: throttled,
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
// sampleFanSpeeds reads fan RPM values from ipmitool sdr.
|
||||
func sampleFanSpeeds() ([]FanReading, error) {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||
if err == nil {
|
||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||
return fans, nil
|
||||
}
|
||||
}
|
||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||
if len(fans) > 0 {
|
||||
return fans, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nil, sensorsErr
|
||||
}
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
//
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 2 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
// Find the first field that contains "RPM" (either as a standalone unit or inline)
|
||||
rpmVal := 0.0
|
||||
found := false
|
||||
for _, p := range parts[1:] {
|
||||
p = strings.TrimSpace(p)
|
||||
if !strings.Contains(strings.ToUpper(p), "RPM") {
|
||||
continue
|
||||
}
|
||||
if strings.EqualFold(p, "RPM") {
|
||||
continue // unit-only column in old format; value is in previous field
|
||||
}
|
||||
val, err := parseFanRPMValue(p)
|
||||
if err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
// Old format: unit "RPM" is in col[2], value is in col[1]
|
||||
if !found && len(parts) >= 3 && strings.EqualFold(strings.TrimSpace(parts[2]), "RPM") {
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if !strings.EqualFold(valStr, "na") && !strings.EqualFold(valStr, "disabled") && valStr != "" {
|
||||
if val, err := parseFanRPMValue(valStr); err == nil {
|
||||
rpmVal = val
|
||||
found = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
fans = append(fans, FanReading{Name: name, RPM: rpmVal})
|
||||
}
|
||||
return fans
|
||||
}
|
||||
|
||||
func parseFanRPMValue(raw string) (float64, error) {
|
||||
fields := strings.Fields(strings.TrimSpace(strings.ReplaceAll(raw, ",", "")))
|
||||
if len(fields) == 0 {
|
||||
return 0, strconv.ErrSyntax
|
||||
}
|
||||
return strconv.ParseFloat(fields[0], 64)
|
||||
}
|
||||
|
||||
func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil, err
|
||||
}
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(out, &doc); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
chips := make([]string, 0, len(doc))
|
||||
for chip := range doc {
|
||||
chips = append(chips, chip)
|
||||
}
|
||||
sort.Strings(chips)
|
||||
var fans []FanReading
|
||||
seen := map[string]struct{}{}
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
names := make([]string, 0, len(features))
|
||||
for name := range features {
|
||||
names = append(names, name)
|
||||
}
|
||||
sort.Strings(names)
|
||||
for _, name := range names {
|
||||
feature, ok := features[name].(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
rpm, ok := firstFanInputValue(feature)
|
||||
if !ok || rpm <= 0 {
|
||||
continue
|
||||
}
|
||||
label := strings.TrimSpace(name)
|
||||
if chip != "" && !strings.Contains(strings.ToLower(label), strings.ToLower(chip)) {
|
||||
label = chip + " / " + label
|
||||
}
|
||||
if _, ok := seen[label]; ok {
|
||||
continue
|
||||
}
|
||||
seen[label] = struct{}{}
|
||||
fans = append(fans, FanReading{Name: label, RPM: rpm})
|
||||
}
|
||||
}
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "fan") || !strings.HasSuffix(lower, "_input") {
|
||||
continue
|
||||
}
|
||||
switch value := feature[key].(type) {
|
||||
case float64:
|
||||
return value, true
|
||||
case string:
|
||||
f, err := strconv.ParseFloat(value, 64)
|
||||
if err == nil {
|
||||
return f, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// sampleCPUMaxTemp returns the highest CPU/inlet temperature from ipmitool or sensors.
|
||||
func sampleCPUMaxTemp() float64 {
|
||||
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
||||
if err != nil {
|
||||
return sampleCPUTempViaSensors()
|
||||
}
|
||||
return parseIPMIMaxTemp(string(out))
|
||||
}
|
||||
|
||||
// parseIPMIMaxTemp extracts the maximum temperature from "ipmitool sdr type Temperature".
|
||||
func parseIPMIMaxTemp(raw string) float64 {
|
||||
var max float64
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
if !strings.Contains(strings.ToLower(unit), "degrees") {
|
||||
continue
|
||||
}
|
||||
valStr := strings.TrimSpace(parts[1])
|
||||
if strings.EqualFold(valStr, "na") || valStr == "" {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(valStr, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleCPUTempViaSensors falls back to lm-sensors when ipmitool is unavailable.
|
||||
func sampleCPUTempViaSensors() float64 {
|
||||
out, err := exec.Command("sensors", "-u").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
var max float64
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
if !strings.HasSuffix(fields[0], "_input:") {
|
||||
continue
|
||||
}
|
||||
val, err := strconv.ParseFloat(fields[1], 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if val > 0 && val < 150 && val > max {
|
||||
max = val
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
// Sample: " Instantaneous power reading: 500 Watts"
|
||||
func parseDCMIPowerReading(raw string) float64 {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
if !strings.Contains(strings.ToLower(line), "instantaneous") {
|
||||
continue
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
for i, p := range parts {
|
||||
if strings.EqualFold(p, "Watts") && i > 0 {
|
||||
val, err := strconv.ParseFloat(parts[i-1], 64)
|
||||
if err == nil {
|
||||
return val
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" && row.Phase != "load2" {
|
||||
continue
|
||||
}
|
||||
for _, gpu := range row.GPUs {
|
||||
if gpu.Throttled {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// analyzeMaxTemp returns the maximum value of the given extractor across all rows.
|
||||
func analyzeMaxTemp(rows []FanStressRow, extract func(FanStressRow) float64) float64 {
|
||||
var max float64
|
||||
for _, row := range rows {
|
||||
if v := extract(row); v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
// analyzeFanResponse returns the seconds from load1 start until fan RPM first
|
||||
// increased by more than 5% above the baseline average. Returns -1 if undetermined.
|
||||
func analyzeFanResponse(rows []FanStressRow) float64 {
|
||||
// Compute baseline average fan RPM.
|
||||
var baseTotal, baseCount float64
|
||||
for _, row := range rows {
|
||||
if row.Phase != "baseline" {
|
||||
continue
|
||||
}
|
||||
for _, f := range row.Fans {
|
||||
baseTotal += f.RPM
|
||||
baseCount++
|
||||
}
|
||||
}
|
||||
if baseCount == 0 || baseTotal == 0 {
|
||||
return -1
|
||||
}
|
||||
baseAvg := baseTotal / baseCount
|
||||
threshold := baseAvg * 1.05 // 5% increase signals fan ramp-up
|
||||
|
||||
// Find elapsed time when load1 started.
|
||||
var load1Start float64 = -1
|
||||
for _, row := range rows {
|
||||
if row.Phase == "load1" {
|
||||
load1Start = row.ElapsedSec
|
||||
break
|
||||
}
|
||||
}
|
||||
if load1Start < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
// Find first load1 row where average RPM crosses the threshold.
|
||||
for _, row := range rows {
|
||||
if row.Phase != "load1" {
|
||||
continue
|
||||
}
|
||||
var total, count float64
|
||||
for _, f := range row.Fans {
|
||||
total += f.RPM
|
||||
count++
|
||||
}
|
||||
if count > 0 && total/count >= threshold {
|
||||
return row.ElapsedSec - load1Start
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// WriteFanStressCSV writes the wide-format metrics CSV with one row per second.
|
||||
// GPU columns are generated per index in gpuIndices order.
|
||||
func WriteFanStressCSV(path string, rows []FanStressRow, gpuIndices []int) error {
|
||||
if len(rows) == 0 {
|
||||
return os.WriteFile(path, []byte("no data\n"), 0644)
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
// Header: fixed system columns + per-GPU columns.
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_avg_rpm,fan_min_rpm,fan_max_rpm,cpu_max_temp_c,sys_power_w")
|
||||
for _, idx := range gpuIndices {
|
||||
fmt.Fprintf(&b, ",gpu%d_temp_c,gpu%d_usage_pct,gpu%d_power_w,gpu%d_clock_mhz,gpu%d_throttled",
|
||||
idx, idx, idx, idx, idx)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
|
||||
for _, row := range rows {
|
||||
favg, fmin, fmax := fanRPMStats(row.Fans)
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%.0f,%.0f,%.0f,%.1f,%.1f",
|
||||
row.TimestampUTC,
|
||||
row.ElapsedSec,
|
||||
row.Phase,
|
||||
favg, fmin, fmax,
|
||||
row.CPUMaxTempC,
|
||||
row.SysPowerW,
|
||||
)
|
||||
gpuByIdx := make(map[int]GPUStressMetric, len(row.GPUs))
|
||||
for _, g := range row.GPUs {
|
||||
gpuByIdx[g.Index] = g
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
g := gpuByIdx[idx]
|
||||
throttled := 0
|
||||
if g.Throttled {
|
||||
throttled = 1
|
||||
}
|
||||
fmt.Fprintf(&b, ",%.1f,%.1f,%.1f,%.0f,%d",
|
||||
g.TempC, g.UsagePct, g.PowerW, g.ClockMHz, throttled)
|
||||
}
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// WriteFanSensorsCSV writes individual fan sensor readings in long (tidy) format.
|
||||
func WriteFanSensorsCSV(path string, rows []FanStressRow) error {
|
||||
var b strings.Builder
|
||||
b.WriteString("timestamp_utc,elapsed_sec,phase,fan_name,rpm\n")
|
||||
for _, row := range rows {
|
||||
for _, f := range row.Fans {
|
||||
fmt.Fprintf(&b, "%s,%.1f,%s,%s,%.0f\n",
|
||||
row.TimestampUTC, row.ElapsedSec, row.Phase, f.Name, f.RPM)
|
||||
}
|
||||
}
|
||||
return os.WriteFile(path, []byte(b.String()), 0644)
|
||||
}
|
||||
|
||||
// fanRPMStats computes average, min, max RPM across all fans in a sample row.
|
||||
func fanRPMStats(fans []FanReading) (avg, min, max float64) {
|
||||
if len(fans) == 0 {
|
||||
return 0, 0, 0
|
||||
}
|
||||
min = fans[0].RPM
|
||||
max = fans[0].RPM
|
||||
var total float64
|
||||
for _, f := range fans {
|
||||
total += f.RPM
|
||||
if f.RPM < min {
|
||||
min = f.RPM
|
||||
}
|
||||
if f.RPM > max {
|
||||
max = f.RPM
|
||||
}
|
||||
}
|
||||
return total / float64(len(fans)), min, max
|
||||
}
|
||||
27
audit/internal/platform/sat_fan_stress_test.go
Normal file
27
audit/internal/platform/sat_fan_stress_test.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseFanSpeeds(t *testing.T) {
|
||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||
got := parseFanSpeeds(raw)
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("fans=%d want 2 (%v)", len(got), got)
|
||||
}
|
||||
if got[0].Name != "FAN1" || got[0].RPM != 2400 {
|
||||
t.Fatalf("fan0=%+v", got[0])
|
||||
}
|
||||
if got[1].Name != "FAN2" || got[1].RPM != 1800 {
|
||||
t.Fatalf("fan1=%+v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFirstFanInputValue(t *testing.T) {
|
||||
feature := map[string]any{
|
||||
"fan1_input": 9200.0,
|
||||
}
|
||||
got, ok := firstFanInputValue(feature)
|
||||
if !ok || got != 9200 {
|
||||
t.Fatalf("got=%v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,9 @@ package platform
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -28,18 +31,59 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||
if len(jobs) != 4 {
|
||||
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||
}
|
||||
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||
}
|
||||
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||
}
|
||||
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
@@ -50,6 +94,93 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 600,
|
||||
Loader: NvidiaStressLoaderJohn,
|
||||
ExcludeGPUIndices: []int{1},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 120,
|
||||
Loader: NvidiaStressLoaderNCCL,
|
||||
GPUIndices: []int{2, 0},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -75,7 +206,8 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -87,3 +219,128 @@ func TestClassifySATResult(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := "nvme0n1 disk nvme\nsda disk usb\nloop0 loop\nsdb disk sata\n"
|
||||
got := parseStorageDevices(raw)
|
||||
want := []string{"/dev/nvme0n1", "/dev/sdb"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len(devices)=%d want %d (%v)", len(got), len(want), got)
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("devices[%d]=%q want %q", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
t.Setenv("PATH", t.TempDir())
|
||||
|
||||
toolPath := filepath.Join(os.Getenv("PATH"), "rocm-smi")
|
||||
if err := os.WriteFile(toolPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showproductname")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != toolPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], toolPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
if err := os.MkdirAll(filepath.Dir(execPath), 0755); err != nil {
|
||||
t.Fatalf("mkdir: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(execPath, []byte("#!/bin/sh\nexit 0\n"), 0755); err != nil {
|
||||
t.Fatalf("write rocm-smi: %v", err)
|
||||
}
|
||||
|
||||
oldGlob := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
rocmSMIExecutableGlobs = []string{execPath}
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
rocmSMIExecutableGlobs = oldGlob
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
t.Setenv("PATH", "")
|
||||
|
||||
cmd, err := resolveROCmSMICommand("--showallinfo")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveROCmSMICommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 2 {
|
||||
t.Fatalf("cmd len=%d want 2 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != execPath {
|
||||
t.Fatalf("cmd[0]=%q want %q", cmd[0], execPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunROCmSMIReportsMissingCommand(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
oldExecGlobs := rocmSMIExecutableGlobs
|
||||
oldScriptGlobs := rocmSMIScriptGlobs
|
||||
satLookPath = func(string) (string, error) { return "", exec.ErrNotFound }
|
||||
rocmSMIExecutableGlobs = nil
|
||||
rocmSMIScriptGlobs = nil
|
||||
t.Cleanup(func() {
|
||||
satLookPath = oldLookPath
|
||||
rocmSMIExecutableGlobs = oldExecGlobs
|
||||
rocmSMIScriptGlobs = oldScriptGlobs
|
||||
})
|
||||
|
||||
if _, err := runROCmSMI("--showproductname"); err == nil {
|
||||
t.Fatal("expected missing rocm-smi error")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,10 @@ func (s *System) ListBeeServices() ([]string, error) {
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
|
||||
150
audit/internal/platform/techdump.go
Normal file
150
audit/internal/platform/techdump.go
Normal file
@@ -0,0 +1,150 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var techDumpFixedCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "dmidecode", Args: []string{"-t", "0"}, File: "dmidecode-type0.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "1"}, File: "dmidecode-type1.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "2"}, File: "dmidecode-type2.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
var techDumpNvidiaCommands = []struct {
|
||||
Name string
|
||||
Args []string
|
||||
File string
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
Blockdevices []struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Tran string `json:"tran"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
|
||||
type nvmeDumpRoot struct {
|
||||
Devices []struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
} `json:"Devices"`
|
||||
}
|
||||
|
||||
func (s *System) CaptureTechnicalDump(baseDir string) error {
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, cmd := range techDumpFixedCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
switch s.DetectGPUVendor() {
|
||||
case "nvidia":
|
||||
for _, cmd := range techDumpNvidiaCommands {
|
||||
writeCommandDump(filepath.Join(baseDir, cmd.File), cmd.Name, cmd.Args...)
|
||||
}
|
||||
case "amd":
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi.txt"))
|
||||
writeROCmSMIDump(filepath.Join(baseDir, "rocm-smi-showallinfo.txt"), "--showallinfo")
|
||||
}
|
||||
|
||||
for _, dev := range lsblkDumpDevices(filepath.Join(baseDir, "lsblk.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "smartctl-"+sanitizeDumpName(dev)+".json"), "smartctl", "-j", "-a", "/dev/"+dev)
|
||||
}
|
||||
for _, dev := range nvmeDumpDevices(filepath.Join(baseDir, "nvme-list.json")) {
|
||||
writeCommandDump(filepath.Join(baseDir, "nvme-id-ctrl-"+sanitizeDumpName(dev)+".json"), "nvme", "id-ctrl", dev, "-o", "json")
|
||||
writeCommandDump(filepath.Join(baseDir, "nvme-smart-log-"+sanitizeDumpName(dev)+".json"), "nvme", "smart-log", dev, "-o", "json")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeCommandDump(path, name string, args ...string) {
|
||||
out, err := exec.Command(name, args...).CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func writeROCmSMIDump(path string, args ...string) {
|
||||
out, err := runROCmSMI(args...)
|
||||
if err != nil && len(out) == 0 {
|
||||
return
|
||||
}
|
||||
_ = os.WriteFile(path, out, 0644)
|
||||
}
|
||||
|
||||
func lsblkDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root lsblkDumpRoot
|
||||
if err := json.Unmarshal(raw, &root); err != nil {
|
||||
return nil
|
||||
}
|
||||
var devices []string
|
||||
for _, dev := range root.Blockdevices {
|
||||
if strings.EqualFold(strings.TrimSpace(dev.Tran), "usb") {
|
||||
continue
|
||||
}
|
||||
if dev.Type == "disk" && strings.TrimSpace(dev.Name) != "" {
|
||||
devices = append(devices, strings.TrimSpace(dev.Name))
|
||||
}
|
||||
}
|
||||
sort.Strings(devices)
|
||||
return devices
|
||||
}
|
||||
|
||||
func nvmeDumpDevices(path string) []string {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
var root nvmeDumpRoot
|
||||
if err := json.Unmarshal(raw, &root); err != nil {
|
||||
return nil
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
var devices []string
|
||||
for _, dev := range root.Devices {
|
||||
name := strings.TrimSpace(dev.DevicePath)
|
||||
if name == "" || seen[name] {
|
||||
continue
|
||||
}
|
||||
seen[name] = true
|
||||
devices = append(devices, name)
|
||||
}
|
||||
sort.Strings(devices)
|
||||
return devices
|
||||
}
|
||||
|
||||
func sanitizeDumpName(value string) string {
|
||||
value = strings.TrimSpace(value)
|
||||
value = strings.TrimPrefix(value, "/dev/")
|
||||
value = strings.ReplaceAll(value, "/", "_")
|
||||
if value == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return value
|
||||
}
|
||||
48
audit/internal/platform/techdump_test.go
Normal file
48
audit/internal/platform/techdump_test.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLSBLKDumpDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "lsblk.json")
|
||||
if err := os.WriteFile(path, []byte(`{"blockdevices":[{"name":"sda","type":"disk","tran":"usb"},{"name":"sda1","type":"part"},{"name":"nvme0n1","type":"disk","tran":"nvme"},{"name":"sdb","type":"disk","tran":"sata"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write lsblk fixture: %v", err)
|
||||
}
|
||||
|
||||
got := lsblkDumpDevices(path)
|
||||
want := []string{"nvme0n1", "sdb"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("lsblkDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNVMEDumpDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "nvme-list.json")
|
||||
if err := os.WriteFile(path, []byte(`{"Devices":[{"DevicePath":"/dev/nvme1n1"},{"DevicePath":"/dev/nvme0n1"},{"DevicePath":"/dev/nvme1n1"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write nvme fixture: %v", err)
|
||||
}
|
||||
|
||||
got := nvmeDumpDevices(path)
|
||||
want := []string{"/dev/nvme0n1", "/dev/nvme1n1"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("nvmeDumpDevices=%v want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeDumpName(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := sanitizeDumpName("/dev/nvme0n1"); got != "nvme0n1" {
|
||||
t.Fatalf("sanitizeDumpName=%q want nvme0n1", got)
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,18 @@ type InterfaceInfo struct {
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkInterfaceSnapshot struct {
|
||||
Name string
|
||||
Up bool
|
||||
IPv4 []string
|
||||
}
|
||||
|
||||
type NetworkSnapshot struct {
|
||||
Interfaces []NetworkInterfaceSnapshot
|
||||
DefaultRoutes []string
|
||||
ResolvConf string
|
||||
}
|
||||
|
||||
type ServiceAction string
|
||||
|
||||
const (
|
||||
@@ -39,6 +51,20 @@ type ToolStatus struct {
|
||||
OK bool
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaStressLoaderBuiltin = "builtin"
|
||||
NvidiaStressLoaderJohn = "john"
|
||||
NvidiaStressLoaderNCCL = "nccl"
|
||||
)
|
||||
|
||||
type NvidiaStressOptions struct {
|
||||
DurationSec int
|
||||
SizeMB int
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
return &System{}
|
||||
}
|
||||
|
||||
@@ -10,9 +10,47 @@ type HardwareIngestRequest struct {
|
||||
Protocol *string `json:"protocol,omitempty"`
|
||||
TargetHost *string `json:"target_host,omitempty"`
|
||||
CollectedAt string `json:"collected_at"`
|
||||
Runtime *RuntimeHealth `json:"runtime,omitempty"`
|
||||
Hardware HardwareSnapshot `json:"hardware"`
|
||||
}
|
||||
|
||||
type RuntimeHealth struct {
|
||||
Status string `json:"status"`
|
||||
CheckedAt string `json:"checked_at"`
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
Interfaces []RuntimeInterface `json:"interfaces,omitempty"`
|
||||
}
|
||||
|
||||
type RuntimeIssue struct {
|
||||
Code string `json:"code"`
|
||||
Severity string `json:"severity,omitempty"`
|
||||
Description string `json:"description"`
|
||||
}
|
||||
|
||||
type RuntimeToolStatus struct {
|
||||
Name string `json:"name"`
|
||||
Path string `json:"path,omitempty"`
|
||||
OK bool `json:"ok"`
|
||||
}
|
||||
|
||||
type RuntimeServiceStatus struct {
|
||||
Name string `json:"name"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
type RuntimeInterface struct {
|
||||
Name string `json:"name"`
|
||||
State string `json:"state,omitempty"`
|
||||
IPv4 []string `json:"ipv4,omitempty"`
|
||||
Outcome string `json:"outcome,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareSnapshot struct {
|
||||
Board HardwareBoard `json:"board"`
|
||||
Firmware []HardwareFirmwareRecord `json:"firmware,omitempty"`
|
||||
@@ -22,6 +60,7 @@ type HardwareSnapshot struct {
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -148,7 +187,7 @@ type HardwarePCIeDevice struct {
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
|
||||
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
|
||||
BDF *string `json:"bdf,omitempty"`
|
||||
BDF *string `json:"-"`
|
||||
DeviceClass *string `json:"device_class,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
@@ -183,11 +222,12 @@ type HardwarePowerSupply struct {
|
||||
}
|
||||
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"`
|
||||
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
ManufacturedYearWeek *string `json:"manufactured_year_week,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareStatusHistory struct {
|
||||
@@ -235,3 +275,15 @@ type HardwareOtherSensor struct {
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
Source string `json:"source"`
|
||||
EventTime *string `json:"event_time,omitempty"`
|
||||
Severity *string `json:"severity,omitempty"`
|
||||
MessageID *string `json:"message_id,omitempty"`
|
||||
Message string `json:"message"`
|
||||
ComponentRef *string `json:"component_ref,omitempty"`
|
||||
Fingerprint *string `json:"fingerprint,omitempty"`
|
||||
IsActive *bool `json:"is_active,omitempty"`
|
||||
RawPayload map[string]any `json:"raw_payload,omitempty"`
|
||||
}
|
||||
|
||||
46
audit/internal/schema/hardware_test.go
Normal file
46
audit/internal/schema/hardware_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package schema
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
week := "2024-W07"
|
||||
eventTime := "2026-03-15T14:03:11Z"
|
||||
message := "Correctable ECC error threshold exceeded"
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
CPUs: []HardwareCPU{
|
||||
{
|
||||
HardwareComponentStatus: HardwareComponentStatus{
|
||||
ManufacturedYearWeek: &week,
|
||||
},
|
||||
},
|
||||
},
|
||||
EventLogs: []HardwareEventLog{
|
||||
{
|
||||
Source: "bmc",
|
||||
EventTime: &eventTime,
|
||||
Message: message,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
if !strings.Contains(text, `"manufactured_year_week":"2024-W07"`) {
|
||||
t.Fatalf("missing manufactured_year_week: %s", text)
|
||||
}
|
||||
if !strings.Contains(text, `"event_logs":[{"source":"bmc","event_time":"2026-03-15T14:03:11Z","message":"Correctable ECC error threshold exceeded"}]`) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) updateStaticForm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "esc":
|
||||
m.screen = screenNetwork
|
||||
m.formFields = nil
|
||||
m.formIndex = 0
|
||||
return m, nil
|
||||
case "up", "shift+tab":
|
||||
if m.formIndex > 0 {
|
||||
m.formIndex--
|
||||
}
|
||||
case "down", "tab":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
}
|
||||
case "enter":
|
||||
if m.formIndex < len(m.formFields)-1 {
|
||||
m.formIndex++
|
||||
return m, nil
|
||||
}
|
||||
cfg := m.app.ParseStaticIPv4Config(m.selectedIface, []string{
|
||||
m.formFields[0].Value,
|
||||
m.formFields[1].Value,
|
||||
m.formFields[2].Value,
|
||||
m.formFields[3].Value,
|
||||
})
|
||||
m.busy = true
|
||||
m.busyTitle = "Static IPv4: " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.SetStaticIPv4Result(cfg)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case "backspace":
|
||||
field := &m.formFields[m.formIndex]
|
||||
if len(field.Value) > 0 {
|
||||
field.Value = field.Value[:len(field.Value)-1]
|
||||
}
|
||||
default:
|
||||
if msg.Type == tea.KeyRunes && len(msg.Runes) > 0 {
|
||||
m.formFields[m.formIndex].Value += string(msg.Runes)
|
||||
}
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateConfirm(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch msg.String() {
|
||||
case "left", "up", "tab":
|
||||
if m.cursor > 0 {
|
||||
m.cursor--
|
||||
}
|
||||
case "right", "down":
|
||||
if m.cursor < 1 {
|
||||
m.cursor++
|
||||
}
|
||||
case "esc":
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
case "enter":
|
||||
if m.cursor == 1 {
|
||||
m.screen = m.confirmCancelTarget()
|
||||
m.cursor = 0
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
}
|
||||
m.busy = true
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
m.busyTitle = "Export audit"
|
||||
target := *m.selectedTarget
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.ExportLatestAuditResult(target)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
||||
}
|
||||
case actionRunNvidiaSAT:
|
||||
m.busyTitle = "NVIDIA SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunNvidiaAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
case actionRunMemorySAT:
|
||||
m.busyTitle = "Memory SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunMemoryAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
case actionRunStorageSAT:
|
||||
m.busyTitle = "Storage SAT"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunStorageAcceptancePackResult("")
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenAcceptance}
|
||||
}
|
||||
}
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) confirmCancelTarget() screen {
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
return screenExportTargets
|
||||
case actionRunNvidiaSAT:
|
||||
fallthrough
|
||||
case actionRunMemorySAT:
|
||||
fallthrough
|
||||
case actionRunStorageSAT:
|
||||
return screenAcceptance
|
||||
default:
|
||||
return screenMain
|
||||
}
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
package tui
|
||||
|
||||
import "bee/audit/internal/platform"
|
||||
|
||||
type resultMsg struct {
|
||||
title string
|
||||
body string
|
||||
err error
|
||||
back screen
|
||||
}
|
||||
|
||||
type servicesMsg struct {
|
||||
services []string
|
||||
err error
|
||||
}
|
||||
|
||||
type interfacesMsg struct {
|
||||
ifaces []platform.InterfaceInfo
|
||||
err error
|
||||
}
|
||||
|
||||
type exportTargetsMsg struct {
|
||||
targets []platform.RemovableTarget
|
||||
err error
|
||||
}
|
||||
|
||||
type bannerMsg struct {
|
||||
text string
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleAcceptanceMenu() (tea.Model, tea.Cmd) {
|
||||
if m.cursor == 3 {
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
case 1:
|
||||
m.pendingAction = actionRunMemorySAT
|
||||
case 2:
|
||||
m.pendingAction = actionRunStorageSAT
|
||||
}
|
||||
m.screen = screenConfirm
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
package tui
|
||||
|
||||
import tea "github.com/charmbracelet/bubbletea"
|
||||
|
||||
func (m model) handleExportTargetsMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.targets) == 0 {
|
||||
return m, resultCmd("Export audit", "No removable filesystems found", nil, screenMain)
|
||||
}
|
||||
target := m.targets[m.cursor]
|
||||
m.selectedTarget = &target
|
||||
m.pendingAction = actionExportAudit
|
||||
m.screen = screenConfirm
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleMainMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.screen = screenNetwork
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 1:
|
||||
m.busy = true
|
||||
m.busyTitle = "Services"
|
||||
return m, func() tea.Msg {
|
||||
services, err := m.app.ListBeeServices()
|
||||
return servicesMsg{services: services, err: err}
|
||||
}
|
||||
case 2:
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case 3:
|
||||
m.busy = true
|
||||
m.busyTitle = "Run audit"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.RunAuditNow(m.runtimeMode)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenMain}
|
||||
}
|
||||
case 4:
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
return m, func() tea.Msg {
|
||||
targets, err := m.app.ListRemovableTargets()
|
||||
return exportTargetsMsg{targets: targets, err: err}
|
||||
}
|
||||
case 5:
|
||||
m.busy = true
|
||||
m.busyTitle = "Required tools"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.ToolCheckResult([]string{"dmidecode", "smartctl", "nvme", "ipmitool", "lspci", "ethtool", "bee", "nvidia-smi", "bee-gpu-stress", "memtester", "dhclient", "lsblk", "mount"})
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 6:
|
||||
m.busy = true
|
||||
m.busyTitle = "Health summary"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.HealthSummaryResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 7:
|
||||
m.busy = true
|
||||
m.busyTitle = "Audit logs"
|
||||
return m, func() tea.Msg {
|
||||
result := m.app.AuditLogTailResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, back: screenMain}
|
||||
}
|
||||
case 8:
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,76 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleNetworkMenu() (tea.Model, tea.Cmd) {
|
||||
switch m.cursor {
|
||||
case 0:
|
||||
m.busy = true
|
||||
m.busyTitle = "Network status"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.NetworkStatus()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 1:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP all interfaces"
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPAllResult()
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case 2:
|
||||
m.pendingAction = actionDHCPOne
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 3:
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.busy = true
|
||||
m.busyTitle = "Interfaces"
|
||||
return m, func() tea.Msg {
|
||||
ifaces, err := m.app.ListInterfaces()
|
||||
return interfacesMsg{ifaces: ifaces, err: err}
|
||||
}
|
||||
case 4:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleInterfacePickMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.interfaces) == 0 {
|
||||
return m, resultCmd("interfaces", "No physical interfaces found", nil, screenNetwork)
|
||||
}
|
||||
m.selectedIface = m.interfaces[m.cursor].Name
|
||||
switch m.pendingAction {
|
||||
case actionDHCPOne:
|
||||
m.busy = true
|
||||
m.busyTitle = "DHCP on " + m.selectedIface
|
||||
return m, func() tea.Msg {
|
||||
result, err := m.app.DHCPOneResult(m.selectedIface)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenNetwork}
|
||||
}
|
||||
case actionStaticIPv4:
|
||||
defaults := m.app.DefaultStaticIPv4FormFields(m.selectedIface)
|
||||
m.formFields = []formField{
|
||||
{Label: "IPv4 address", Value: defaults[0]},
|
||||
{Label: "Prefix", Value: defaults[1]},
|
||||
{Label: "Gateway", Value: strings.TrimSpace(defaults[2])},
|
||||
{Label: "DNS (space-separated)", Value: defaults[3]},
|
||||
}
|
||||
m.formIndex = 0
|
||||
m.screen = screenStaticForm
|
||||
return m, nil
|
||||
default:
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
@@ -1,47 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) handleServicesMenu() (tea.Model, tea.Cmd) {
|
||||
if len(m.services) == 0 {
|
||||
return m, resultCmd("Services", "No bee-* services found.", nil, screenMain)
|
||||
}
|
||||
m.selectedService = m.services[m.cursor]
|
||||
m.screen = screenServiceAction
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) handleServiceActionMenu() (tea.Model, tea.Cmd) {
|
||||
action := m.serviceMenu[m.cursor]
|
||||
if action == "back" {
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
}
|
||||
|
||||
m.busy = true
|
||||
m.busyTitle = "service: " + m.selectedService
|
||||
return m, func() tea.Msg {
|
||||
switch action {
|
||||
case "Status":
|
||||
result, err := m.app.ServiceStatusResult(m.selectedService)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Restart":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceRestart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Start":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStart)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
case "Stop":
|
||||
result, err := m.app.ServiceActionResult(m.selectedService, platform.ServiceStop)
|
||||
return resultMsg{title: result.Title, body: result.Body, err: err, back: screenServiceAction}
|
||||
default:
|
||||
return resultMsg{title: "Service", body: "Unknown action.", back: screenServiceAction}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,588 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func newTestModel() model {
|
||||
return newModel(app.New(platform.New()), runtimeenv.ModeLocal)
|
||||
}
|
||||
|
||||
func sendKey(t *testing.T, m model, key tea.KeyType) model {
|
||||
t.Helper()
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: key})
|
||||
return next.(model)
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuCursorNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyDown)
|
||||
if m.cursor != 2 {
|
||||
t.Fatalf("cursor=%d want 2 after second down", m.cursor)
|
||||
}
|
||||
|
||||
m = sendKey(t, m, tea.KeyUp)
|
||||
if m.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after up", m.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateMainMenuEnterActions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
wantBusy bool
|
||||
wantCmd bool
|
||||
}{
|
||||
{name: "network", cursor: 0, wantScreen: screenNetwork},
|
||||
{name: "services", cursor: 1, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "acceptance", cursor: 2, wantScreen: screenAcceptance},
|
||||
{name: "run audit", cursor: 3, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
{name: "export", cursor: 4, wantScreen: screenMain, wantBusy: true, wantCmd: true},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.busy != test.wantBusy {
|
||||
t.Fatalf("busy=%v want %v", got.busy, test.wantBusy)
|
||||
}
|
||||
if (cmd != nil) != test.wantCmd {
|
||||
t.Fatalf("cmd present=%v want %v", cmd != nil, test.wantCmd)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateConfirmCancelViaKeys(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
|
||||
next, _ := m.Update(tea.KeyMsg{Type: tea.KeyRight})
|
||||
got := next.(model)
|
||||
if got.cursor != 1 {
|
||||
t.Fatalf("cursor=%d want 1 after right", got.cursor)
|
||||
}
|
||||
|
||||
next, _ = got.Update(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got = next.(model)
|
||||
if got.screen != screenAcceptance {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenAcceptance)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0 after cancel", got.cursor)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuSimpleTransitions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "network", cursor: 0, wantScreen: screenNetwork},
|
||||
{name: "acceptance", cursor: 2, wantScreen: screenAcceptance},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatalf("expected nil cmd for %s", test.name)
|
||||
}
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainMenuAsyncActionsSetBusy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
cursor int
|
||||
}{
|
||||
{name: "services", cursor: 1},
|
||||
{name: "run audit", cursor: 3},
|
||||
{name: "export", cursor: 4},
|
||||
{name: "check tools", cursor: 5},
|
||||
{name: "health summary", cursor: 6},
|
||||
{name: "log tail", cursor: 7},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, cmd := m.handleMainMenu()
|
||||
got := next.(model)
|
||||
|
||||
if !got.busy {
|
||||
t.Fatalf("busy=false for %s", test.name)
|
||||
}
|
||||
if cmd == nil {
|
||||
t.Fatalf("expected async cmd for %s", test.name)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainViewIncludesBanner(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.banner = "System: Test Server | S/N ABC123\nIP: 10.0.0.10"
|
||||
|
||||
view := m.View()
|
||||
if !strings.Contains(view, "System: Test Server | S/N ABC123") {
|
||||
t.Fatalf("view missing system banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "IP: 10.0.0.10") {
|
||||
t.Fatalf("view missing ip banner:\n%s", view)
|
||||
}
|
||||
if !strings.Contains(view, "Select action") {
|
||||
t.Fatalf("view missing menu subtitle:\n%s", view)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEscapeNavigation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
screen screen
|
||||
wantScreen screen
|
||||
}{
|
||||
{name: "network to main", screen: screenNetwork, wantScreen: screenMain},
|
||||
{name: "services to main", screen: screenServices, wantScreen: screenMain},
|
||||
{name: "acceptance to main", screen: screenAcceptance, wantScreen: screenMain},
|
||||
{name: "service action to services", screen: screenServiceAction, wantScreen: screenServices},
|
||||
{name: "export targets to main", screen: screenExportTargets, wantScreen: screenMain},
|
||||
{name: "interface pick to network", screen: screenInterfacePick, wantScreen: screenNetwork},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = test.screen
|
||||
m.cursor = 3
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != test.wantScreen {
|
||||
t.Fatalf("screen=%q want %q", got.screen, test.wantScreen)
|
||||
}
|
||||
if got.cursor != 0 {
|
||||
t.Fatalf("cursor=%d want 0", got.cursor)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutputScreenReturnsToPreviousScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.prevScreen = screenNetwork
|
||||
m.title = "title"
|
||||
m.body = "body"
|
||||
|
||||
next, _ := m.updateKey(tea.KeyMsg{Type: tea.KeyEnter})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenNetwork {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenNetwork)
|
||||
}
|
||||
if got.title != "" || got.body != "" {
|
||||
t.Fatalf("expected output state cleared, got title=%q body=%q", got.title, got.body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcceptanceConfirmFlow(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = 0
|
||||
|
||||
next, cmd := m.handleAcceptanceMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionRunNvidiaSAT {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionRunNvidiaSAT)
|
||||
}
|
||||
|
||||
next, _ = got.updateConfirm(tea.KeyMsg{Type: tea.KeyEsc})
|
||||
got = next.(model)
|
||||
if got.screen != screenAcceptance {
|
||||
t.Fatalf("screen after esc=%q want %q", got.screen, screenAcceptance)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAcceptanceMenuMapsNewTargets(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
cursor int
|
||||
want actionKind
|
||||
}{
|
||||
{cursor: 0, want: actionRunNvidiaSAT},
|
||||
{cursor: 1, want: actionRunMemorySAT},
|
||||
{cursor: 2, want: actionRunStorageSAT},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
m := newTestModel()
|
||||
m.screen = screenAcceptance
|
||||
m.cursor = test.cursor
|
||||
|
||||
next, _ := m.handleAcceptanceMenu()
|
||||
got := next.(model)
|
||||
if got.pendingAction != test.want {
|
||||
t.Fatalf("cursor=%d pendingAction=%q want %q", test.cursor, got.pendingAction, test.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportTargetSelectionOpensConfirm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{{Device: "/dev/sdb1", FSType: "vfat", Size: "16G"}}
|
||||
|
||||
next, cmd := m.handleExportTargetsMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenConfirm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenConfirm)
|
||||
}
|
||||
if got.pendingAction != actionExportAudit {
|
||||
t.Fatalf("pendingAction=%q want %q", got.pendingAction, actionExportAudit)
|
||||
}
|
||||
if got.selectedTarget == nil || got.selectedTarget.Device != "/dev/sdb1" {
|
||||
t.Fatalf("selectedTarget=%+v want /dev/sdb1", got.selectedTarget)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInterfacePickStaticIPv4OpensForm(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.pendingAction = actionStaticIPv4
|
||||
m.interfaces = []platform.InterfaceInfo{{Name: "eth0"}}
|
||||
|
||||
next, cmd := m.handleInterfacePickMenu()
|
||||
got := next.(model)
|
||||
|
||||
if cmd != nil {
|
||||
t.Fatal("expected nil cmd")
|
||||
}
|
||||
if got.screen != screenStaticForm {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenStaticForm)
|
||||
}
|
||||
if got.selectedIface != "eth0" {
|
||||
t.Fatalf("selectedIface=%q want eth0", got.selectedIface)
|
||||
}
|
||||
if len(got.formFields) != 4 {
|
||||
t.Fatalf("len(formFields)=%d want 4", len(got.formFields))
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgUsesExplicitBackScreen(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "done", body: "ok", back: screenNetwork})
|
||||
got := next.(model)
|
||||
|
||||
if got.screen != screenOutput {
|
||||
t.Fatalf("screen=%q want %q", got.screen, screenOutput)
|
||||
}
|
||||
if got.prevScreen != screenNetwork {
|
||||
t.Fatalf("prevScreen=%q want %q", got.prevScreen, screenNetwork)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConfirmCancelTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
m.pendingAction = actionExportAudit
|
||||
if got := m.confirmCancelTarget(); got != screenExportTargets {
|
||||
t.Fatalf("export cancel target=%q want %q", got, screenExportTargets)
|
||||
}
|
||||
|
||||
m.pendingAction = actionRunNvidiaSAT
|
||||
if got := m.confirmCancelTarget(); got != screenAcceptance {
|
||||
t.Fatalf("sat cancel target=%q want %q", got, screenAcceptance)
|
||||
}
|
||||
|
||||
m.pendingAction = actionNone
|
||||
if got := m.confirmCancelTarget(); got != screenMain {
|
||||
t.Fatalf("default cancel target=%q want %q", got, screenMain)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewMainMenuRendersSelectedItem(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.cursor = 1
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"bee",
|
||||
"Select action",
|
||||
" Network",
|
||||
"> Services",
|
||||
"Acceptance tests",
|
||||
"[↑/↓] move [enter] select [esc] back [ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBusyStateIsMinimal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
|
||||
view := m.View()
|
||||
want := "bee\n\nWorking...\n\n[ctrl+c] quit\n"
|
||||
if view != want {
|
||||
t.Fatalf("busy view mismatch\nwant:\n%s\ngot:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewBusyStateUsesBusyTitle(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Working...",
|
||||
"[ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewOutputScreenRendersBodyAndBackHint(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenOutput
|
||||
m.title = "Run audit"
|
||||
m.body = "audit output: /var/log/bee-audit.json\n"
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Run audit",
|
||||
"audit output: /var/log/bee-audit.json",
|
||||
"[enter/esc] back [ctrl+c] quit",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewExportTargetsRendersDeviceMetadata(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenExportTargets
|
||||
m.targets = []platform.RemovableTarget{
|
||||
{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "vfat",
|
||||
Size: "29G",
|
||||
Label: "BEEUSB",
|
||||
Mountpoint: "/media/bee",
|
||||
},
|
||||
}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Select removable filesystem",
|
||||
"> /dev/sdb1 [vfat 29G] label=BEEUSB mounted=/media/bee",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewStaticFormRendersFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenStaticForm
|
||||
m.selectedIface = "enp1s0"
|
||||
m.formFields = []formField{
|
||||
{Label: "Address", Value: "192.0.2.10/24"},
|
||||
{Label: "Gateway", Value: "192.0.2.1"},
|
||||
{Label: "DNS", Value: "1.1.1.1"},
|
||||
}
|
||||
m.formIndex = 1
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Static IPv4: enp1s0",
|
||||
" Address: 192.0.2.10/24",
|
||||
"> Gateway: 192.0.2.1",
|
||||
" DNS: 1.1.1.1",
|
||||
"[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewConfirmScreenMatchesPendingExport(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.screen = screenConfirm
|
||||
m.pendingAction = actionExportAudit
|
||||
m.selectedTarget = &platform.RemovableTarget{Device: "/dev/sdb1"}
|
||||
|
||||
view := m.View()
|
||||
|
||||
for _, want := range []string{
|
||||
"Export audit",
|
||||
"Copy latest audit JSON to /dev/sdb1?",
|
||||
"> Confirm",
|
||||
" Cancel",
|
||||
} {
|
||||
if !strings.Contains(view, want) {
|
||||
t.Fatalf("view missing %q\nview:\n%s", want, view)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgClearsBusyAndPendingAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
m.busy = true
|
||||
m.busyTitle = "Export audit"
|
||||
m.pendingAction = actionExportAudit
|
||||
m.screen = screenConfirm
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "Export audit", body: "done", back: screenMain})
|
||||
got := next.(model)
|
||||
|
||||
if got.busy {
|
||||
t.Fatal("busy=true want false")
|
||||
}
|
||||
if got.busyTitle != "" {
|
||||
t.Fatalf("busyTitle=%q want empty", got.busyTitle)
|
||||
}
|
||||
if got.pendingAction != actionNone {
|
||||
t.Fatalf("pendingAction=%q want empty", got.pendingAction)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResultMsgErrorWithoutBodyFormatsCleanly(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m := newTestModel()
|
||||
|
||||
next, _ := m.Update(resultMsg{title: "Export audit", err: assertErr("boom"), back: screenMain})
|
||||
got := next.(model)
|
||||
|
||||
if got.body != "ERROR: boom" {
|
||||
t.Fatalf("body=%q want %q", got.body, "ERROR: boom")
|
||||
}
|
||||
}
|
||||
|
||||
type assertErr string
|
||||
|
||||
func (e assertErr) Error() string { return string(e) }
|
||||
@@ -1,119 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
type screen string
|
||||
|
||||
const (
|
||||
screenMain screen = "main"
|
||||
screenNetwork screen = "network"
|
||||
screenInterfacePick screen = "interface_pick"
|
||||
screenServices screen = "services"
|
||||
screenServiceAction screen = "service_action"
|
||||
screenAcceptance screen = "acceptance"
|
||||
screenExportTargets screen = "export_targets"
|
||||
screenOutput screen = "output"
|
||||
screenStaticForm screen = "static_form"
|
||||
screenConfirm screen = "confirm"
|
||||
)
|
||||
|
||||
type actionKind string
|
||||
|
||||
const (
|
||||
actionNone actionKind = ""
|
||||
actionDHCPOne actionKind = "dhcp_one"
|
||||
actionStaticIPv4 actionKind = "static_ipv4"
|
||||
actionExportAudit actionKind = "export_audit"
|
||||
actionRunNvidiaSAT actionKind = "run_nvidia_sat"
|
||||
actionRunMemorySAT actionKind = "run_memory_sat"
|
||||
actionRunStorageSAT actionKind = "run_storage_sat"
|
||||
)
|
||||
|
||||
type model struct {
|
||||
app *app.App
|
||||
runtimeMode runtimeenv.Mode
|
||||
|
||||
screen screen
|
||||
prevScreen screen
|
||||
cursor int
|
||||
busy bool
|
||||
busyTitle string
|
||||
title string
|
||||
body string
|
||||
banner string
|
||||
mainMenu []string
|
||||
networkMenu []string
|
||||
serviceMenu []string
|
||||
|
||||
services []string
|
||||
interfaces []platform.InterfaceInfo
|
||||
targets []platform.RemovableTarget
|
||||
selectedService string
|
||||
selectedIface string
|
||||
selectedTarget *platform.RemovableTarget
|
||||
pendingAction actionKind
|
||||
|
||||
formFields []formField
|
||||
formIndex int
|
||||
}
|
||||
|
||||
type formField struct {
|
||||
Label string
|
||||
Value string
|
||||
}
|
||||
|
||||
func Run(application *app.App, runtimeMode runtimeenv.Mode) error {
|
||||
options := []tea.ProgramOption{}
|
||||
if runtimeMode != runtimeenv.ModeLiveCD {
|
||||
options = append(options, tea.WithAltScreen())
|
||||
}
|
||||
program := tea.NewProgram(newModel(application, runtimeMode), options...)
|
||||
_, err := program.Run()
|
||||
return err
|
||||
}
|
||||
|
||||
func newModel(application *app.App, runtimeMode runtimeenv.Mode) model {
|
||||
return model{
|
||||
app: application,
|
||||
runtimeMode: runtimeMode,
|
||||
screen: screenMain,
|
||||
mainMenu: []string{
|
||||
"Network",
|
||||
"Services",
|
||||
"Acceptance tests",
|
||||
"Run audit",
|
||||
"Export audit",
|
||||
"Check tools",
|
||||
"Show health summary",
|
||||
"Show audit logs",
|
||||
"Exit",
|
||||
},
|
||||
networkMenu: []string{
|
||||
"Show status",
|
||||
"DHCP on all interfaces",
|
||||
"DHCP on one interface",
|
||||
"Set static IPv4",
|
||||
"Back",
|
||||
},
|
||||
serviceMenu: []string{
|
||||
"Status",
|
||||
"Restart",
|
||||
"Start",
|
||||
"Stop",
|
||||
"Back",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (m model) Init() tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return bannerMsg{text: strings.TrimSpace(m.app.MainBanner())}
|
||||
}
|
||||
}
|
||||
@@ -1,168 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
|
||||
switch msg := msg.(type) {
|
||||
case tea.KeyMsg:
|
||||
if m.busy {
|
||||
switch msg.String() {
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
default:
|
||||
return m, nil
|
||||
}
|
||||
}
|
||||
return m.updateKey(msg)
|
||||
case resultMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
m.title = msg.title
|
||||
if msg.err != nil {
|
||||
body := strings.TrimSpace(msg.body)
|
||||
if body == "" {
|
||||
m.body = fmt.Sprintf("ERROR: %v", msg.err)
|
||||
} else {
|
||||
m.body = fmt.Sprintf("%s\n\nERROR: %v", body, msg.err)
|
||||
}
|
||||
} else {
|
||||
m.body = msg.body
|
||||
}
|
||||
m.pendingAction = actionNone
|
||||
if msg.back != "" {
|
||||
m.prevScreen = msg.back
|
||||
} else {
|
||||
m.prevScreen = m.screen
|
||||
}
|
||||
m.screen = screenOutput
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case servicesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "Services"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.services = msg.services
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case interfacesMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "interfaces"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.interfaces = msg.ifaces
|
||||
m.screen = screenInterfacePick
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case exportTargetsMsg:
|
||||
m.busy = false
|
||||
m.busyTitle = ""
|
||||
if msg.err != nil {
|
||||
m.title = "export"
|
||||
m.body = msg.err.Error()
|
||||
m.prevScreen = screenMain
|
||||
m.screen = screenOutput
|
||||
return m, nil
|
||||
}
|
||||
m.targets = msg.targets
|
||||
m.screen = screenExportTargets
|
||||
m.cursor = 0
|
||||
return m, nil
|
||||
case bannerMsg:
|
||||
m.banner = strings.TrimSpace(msg.text)
|
||||
return m, nil
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateKey(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return m.updateMenu(msg, len(m.mainMenu), m.handleMainMenu)
|
||||
case screenNetwork:
|
||||
return m.updateMenu(msg, len(m.networkMenu), m.handleNetworkMenu)
|
||||
case screenServices:
|
||||
return m.updateMenu(msg, len(m.services), m.handleServicesMenu)
|
||||
case screenServiceAction:
|
||||
return m.updateMenu(msg, len(m.serviceMenu), m.handleServiceActionMenu)
|
||||
case screenAcceptance:
|
||||
return m.updateMenu(msg, 4, m.handleAcceptanceMenu)
|
||||
case screenExportTargets:
|
||||
return m.updateMenu(msg, len(m.targets), m.handleExportTargetsMenu)
|
||||
case screenInterfacePick:
|
||||
return m.updateMenu(msg, len(m.interfaces), m.handleInterfacePickMenu)
|
||||
case screenOutput:
|
||||
switch msg.String() {
|
||||
case "esc", "enter", "q":
|
||||
m.screen = m.prevScreen
|
||||
m.body = ""
|
||||
m.title = ""
|
||||
m.pendingAction = actionNone
|
||||
return m, nil
|
||||
case "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
case screenStaticForm:
|
||||
return m.updateStaticForm(msg)
|
||||
case screenConfirm:
|
||||
return m.updateConfirm(msg)
|
||||
}
|
||||
|
||||
if msg.String() == "ctrl+c" {
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m model) updateMenu(msg tea.KeyMsg, size int, onEnter func() (tea.Model, tea.Cmd)) (tea.Model, tea.Cmd) {
|
||||
if size == 0 {
|
||||
size = 1
|
||||
}
|
||||
switch msg.String() {
|
||||
case "up", "k":
|
||||
if m.cursor > 0 {
|
||||
m.cursor--
|
||||
}
|
||||
case "down", "j":
|
||||
if m.cursor < size-1 {
|
||||
m.cursor++
|
||||
}
|
||||
case "enter":
|
||||
return onEnter()
|
||||
case "esc":
|
||||
switch m.screen {
|
||||
case screenNetwork, screenServices, screenAcceptance:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
case screenServiceAction:
|
||||
m.screen = screenServices
|
||||
m.cursor = 0
|
||||
case screenExportTargets:
|
||||
m.screen = screenMain
|
||||
m.cursor = 0
|
||||
case screenInterfacePick:
|
||||
m.screen = screenNetwork
|
||||
m.cursor = 0
|
||||
}
|
||||
case "q", "ctrl+c":
|
||||
return m, tea.Quit
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
@@ -1,169 +0,0 @@
|
||||
package tui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
|
||||
tea "github.com/charmbracelet/bubbletea"
|
||||
)
|
||||
|
||||
func (m model) View() string {
|
||||
if m.busy {
|
||||
title := "bee"
|
||||
if m.busyTitle != "" {
|
||||
title = m.busyTitle
|
||||
}
|
||||
return fmt.Sprintf("%s\n\nWorking...\n\n[ctrl+c] quit\n", title)
|
||||
}
|
||||
switch m.screen {
|
||||
case screenMain:
|
||||
return renderMainMenu("bee", m.banner, "Select action", m.mainMenu, m.cursor)
|
||||
case screenNetwork:
|
||||
return renderMenu("Network", "Select action", m.networkMenu, m.cursor)
|
||||
case screenServices:
|
||||
return renderMenu("Services", "Select service", m.services, m.cursor)
|
||||
case screenServiceAction:
|
||||
items := make([]string, len(m.serviceMenu))
|
||||
copy(items, m.serviceMenu)
|
||||
return renderMenu("Service: "+m.selectedService, "Select action", items, m.cursor)
|
||||
case screenAcceptance:
|
||||
return renderMenu("Acceptance tests", "Select action", []string{"Run NVIDIA command pack", "Run memory test", "Run storage diagnostic pack", "Back"}, m.cursor)
|
||||
case screenExportTargets:
|
||||
return renderMenu("Export audit", "Select removable filesystem", renderTargetItems(m.targets), m.cursor)
|
||||
case screenInterfacePick:
|
||||
return renderMenu("Interfaces", "Select interface", renderInterfaceItems(m.interfaces), m.cursor)
|
||||
case screenStaticForm:
|
||||
return renderForm("Static IPv4: "+m.selectedIface, m.formFields, m.formIndex)
|
||||
case screenConfirm:
|
||||
title, body := m.confirmBody()
|
||||
return renderConfirm(title, body, m.cursor)
|
||||
case screenOutput:
|
||||
return fmt.Sprintf("%s\n\n%s\n\n[enter/esc] back [ctrl+c] quit\n", m.title, strings.TrimSpace(m.body))
|
||||
default:
|
||||
return "bee\n"
|
||||
}
|
||||
}
|
||||
|
||||
func (m model) confirmBody() (string, string) {
|
||||
switch m.pendingAction {
|
||||
case actionExportAudit:
|
||||
if m.selectedTarget == nil {
|
||||
return "Export audit", "No target selected"
|
||||
}
|
||||
return "Export audit", fmt.Sprintf("Copy latest audit JSON to %s?", m.selectedTarget.Device)
|
||||
case actionRunNvidiaSAT:
|
||||
return "NVIDIA SAT", "Run NVIDIA acceptance command pack?"
|
||||
case actionRunMemorySAT:
|
||||
return "Memory SAT", "Run runtime memory test with memtester?"
|
||||
case actionRunStorageSAT:
|
||||
return "Storage SAT", "Run storage diagnostic pack and start short self-tests where supported?"
|
||||
default:
|
||||
return "Confirm", "Proceed?"
|
||||
}
|
||||
}
|
||||
|
||||
func renderTargetItems(targets []platform.RemovableTarget) []string {
|
||||
items := make([]string, 0, len(targets))
|
||||
for _, target := range targets {
|
||||
desc := fmt.Sprintf("%s [%s %s]", target.Device, target.FSType, target.Size)
|
||||
if target.Label != "" {
|
||||
desc += " label=" + target.Label
|
||||
}
|
||||
if target.Mountpoint != "" {
|
||||
desc += " mounted=" + target.Mountpoint
|
||||
}
|
||||
items = append(items, desc)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderInterfaceItems(interfaces []platform.InterfaceInfo) []string {
|
||||
items := make([]string, 0, len(interfaces))
|
||||
for _, iface := range interfaces {
|
||||
label := iface.Name
|
||||
if len(iface.IPv4) > 0 {
|
||||
label += " [" + strings.Join(iface.IPv4, ", ") + "]"
|
||||
}
|
||||
items = append(items, label)
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func renderMenu(title, subtitle string, items []string, cursor int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n%s\n\n", title, subtitle)
|
||||
if len(items) == 0 {
|
||||
body.WriteString("(no items)\n")
|
||||
} else {
|
||||
for i, item := range items {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s\n", prefix, item)
|
||||
}
|
||||
}
|
||||
body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderMainMenu(title, banner, subtitle string, items []string, cursor int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
if banner != "" {
|
||||
body.WriteString(strings.TrimSpace(banner))
|
||||
body.WriteString("\n\n")
|
||||
}
|
||||
body.WriteString(subtitle)
|
||||
body.WriteString("\n\n")
|
||||
if len(items) == 0 {
|
||||
body.WriteString("(no items)\n")
|
||||
} else {
|
||||
for i, item := range items {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s\n", prefix, item)
|
||||
}
|
||||
}
|
||||
body.WriteString("\n[↑/↓] move [enter] select [esc] back [ctrl+c] quit\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderForm(title string, fields []formField, idx int) string {
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s\n\n", title)
|
||||
for i, field := range fields {
|
||||
prefix := " "
|
||||
if i == idx {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&body, "%s%s: %s\n", prefix, field.Label, field.Value)
|
||||
}
|
||||
body.WriteString("\n[tab/↑/↓] move [enter] next/submit [backspace] delete [esc] cancel\n")
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func renderConfirm(title, body string, cursor int) string {
|
||||
options := []string{"Confirm", "Cancel"}
|
||||
var out strings.Builder
|
||||
fmt.Fprintf(&out, "%s\n\n%s\n\n", title, body)
|
||||
for i, option := range options {
|
||||
prefix := " "
|
||||
if i == cursor {
|
||||
prefix = "> "
|
||||
}
|
||||
fmt.Fprintf(&out, "%s%s\n", prefix, option)
|
||||
}
|
||||
out.WriteString("\n[←/→/↑/↓] move [enter] select [esc] cancel\n")
|
||||
return out.String()
|
||||
}
|
||||
|
||||
func resultCmd(title, body string, err error, back screen) tea.Cmd {
|
||||
return func() tea.Msg {
|
||||
return resultMsg{title: title, body: body, err: err, back: back}
|
||||
}
|
||||
}
|
||||
989
audit/internal/webui/api.go
Normal file
989
audit/internal/webui/api.go
Normal file
@@ -0,0 +1,989 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
var jobCounter atomic.Uint64
|
||||
|
||||
func newJobID(prefix string) string {
|
||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
||||
}
|
||||
|
||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||
f, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
if event != "" {
|
||||
fmt.Fprintf(w, "event: %s\n", event)
|
||||
}
|
||||
fmt.Fprintf(w, "data: %s\n\n", data)
|
||||
f.Flush()
|
||||
return true
|
||||
}
|
||||
|
||||
func sseStart(w http.ResponseWriter) bool {
|
||||
_, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
http.Error(w, "streaming not supported", http.StatusInternalServerError)
|
||||
return false
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/event-stream")
|
||||
w.Header().Set("Cache-Control", "no-cache")
|
||||
w.Header().Set("Connection", "keep-alive")
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
return true
|
||||
}
|
||||
|
||||
// streamJob streams lines from a jobState to a SSE response.
|
||||
func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
existing, ch := j.subscribe()
|
||||
for _, line := range existing {
|
||||
sseWrite(w, "", line)
|
||||
}
|
||||
if ch == nil {
|
||||
// Job already finished
|
||||
sseWrite(w, "done", j.err)
|
||||
return
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case line, ok := <-ch:
|
||||
if !ok {
|
||||
sseWrite(w, "done", j.err)
|
||||
return
|
||||
}
|
||||
sseWrite(w, "", line)
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// runCmdJob runs an exec.Cmd as a background job, streaming stdout+stderr lines.
|
||||
func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
pr, pw := io.Pipe()
|
||||
cmd.Stdout = pw
|
||||
cmd.Stderr = pw
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
j.finish(err.Error())
|
||||
return
|
||||
}
|
||||
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
||||
// so the X server and kernel interrupt handling remain responsive under load
|
||||
// (prevents KVM/IPMI graphical console from freezing during GPU stress tests).
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
||||
}
|
||||
|
||||
go func() {
|
||||
scanner := bufio.NewScanner(pr)
|
||||
for scanner.Scan() {
|
||||
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
||||
// and strip ANSI escape codes so logs are readable in the browser.
|
||||
parts := strings.Split(scanner.Text(), "\r")
|
||||
for _, part := range parts {
|
||||
line := ansiEscapeRE.ReplaceAllString(part, "")
|
||||
if line != "" {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
err := cmd.Wait()
|
||||
_ = pw.Close()
|
||||
if err != nil {
|
||||
j.finish(err.Error())
|
||||
} else {
|
||||
j.finish("")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Audit ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("audit"),
|
||||
Name: "Audit",
|
||||
Target: "audit",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIAuditStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
// Try task queue first, then legacy job manager
|
||||
if j, ok := globalQueue.findJob(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
// ── SAT ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
if j, ok := globalQueue.findJob(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.URL.Query().Get("job_id")
|
||||
if id == "" {
|
||||
id = r.URL.Query().Get("task_id")
|
||||
}
|
||||
if t, ok := globalQueue.findByID(id); ok {
|
||||
globalQueue.mu.Lock()
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
return
|
||||
}
|
||||
if j, ok := globalJobs.get(id); ok {
|
||||
if j.abort() {
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
} else {
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
}
|
||||
return
|
||||
}
|
||||
http.Error(w, "job not found", http.StatusNotFound)
|
||||
}
|
||||
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIServicesList(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
names, err := h.opts.App.ListBeeServices()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
type serviceInfo struct {
|
||||
Name string `json:"name"`
|
||||
State string `json:"state"`
|
||||
Body string `json:"body"`
|
||||
}
|
||||
result := make([]serviceInfo, 0, len(names))
|
||||
for _, name := range names {
|
||||
state := h.opts.App.ServiceState(name)
|
||||
body, _ := h.opts.App.ServiceStatus(name)
|
||||
result = append(result, serviceInfo{Name: name, State: state, Body: body})
|
||||
}
|
||||
writeJSON(w, result)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Name string `json:"name"`
|
||||
Action string `json:"action"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
var action platform.ServiceAction
|
||||
switch req.Action {
|
||||
case "start":
|
||||
action = platform.ServiceStart
|
||||
case "stop":
|
||||
action = platform.ServiceStop
|
||||
case "restart":
|
||||
action = platform.ServiceRestart
|
||||
default:
|
||||
writeError(w, http.StatusBadRequest, "action must be start|stop|restart")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
||||
}
|
||||
|
||||
// ── Network ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
ifaces, err := h.opts.App.ListInterfaces()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"interfaces": ifaces,
|
||||
"default_route": h.opts.App.DefaultRoute(),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkDHCP(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Interface string `json:"interface"`
|
||||
}
|
||||
_ = json.NewDecoder(r.Body).Decode(&req)
|
||||
|
||||
result, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||
if req.Interface == "" || req.Interface == "all" {
|
||||
return h.opts.App.DHCPAllResult()
|
||||
}
|
||||
return h.opts.App.DHCPOneResult(req.Interface)
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"output": result.Body,
|
||||
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkStatic(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Interface string `json:"interface"`
|
||||
Address string `json:"address"`
|
||||
Prefix string `json:"prefix"`
|
||||
Gateway string `json:"gateway"`
|
||||
DNS []string `json:"dns"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
cfg := platform.StaticIPv4Config{
|
||||
Interface: req.Interface,
|
||||
Address: req.Address,
|
||||
Prefix: req.Prefix,
|
||||
Gateway: req.Gateway,
|
||||
DNS: req.DNS,
|
||||
}
|
||||
result, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||
return h.opts.App.SetStaticIPv4Result(cfg)
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"output": result.Body,
|
||||
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||
})
|
||||
}
|
||||
|
||||
// ── Export ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
||||
entries, err := listExportFiles(h.opts.ExportDir)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
||||
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{
|
||||
"status": "ok",
|
||||
"path": archive,
|
||||
"url": "/export/support.tar.gz",
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if targets == nil {
|
||||
targets = []platform.RemovableTarget{}
|
||||
}
|
||||
writeJSON(w, targets)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBAudit(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ExportLatestAuditResult(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ExportSupportBundleResult(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
gp := h.opts.App.DetectGPUPresence()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{
|
||||
"nvidia": gp.Nvidia,
|
||||
"amd": gp.AMD,
|
||||
})
|
||||
}
|
||||
|
||||
// ── System ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
inRAM := h.opts.App.IsLiveMediaInRAM()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
h.installMu.Lock()
|
||||
installRunning := h.installJob != nil && !h.installJob.isDone()
|
||||
h.installMu.Unlock()
|
||||
if installRunning {
|
||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||
return
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("install-to-ram"),
|
||||
Name: "Install to RAM",
|
||||
Target: "install-to-ram",
|
||||
Priority: 10,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
statuses := h.opts.App.CheckTools(standardTools)
|
||||
writeJSON(w, statuses)
|
||||
}
|
||||
|
||||
// ── Preflight ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIPreflight(w http.ResponseWriter, r *http.Request) {
|
||||
data, err := loadSnapshot(filepath.Join(h.opts.ExportDir, "runtime-health.json"))
|
||||
if err != nil {
|
||||
writeError(w, http.StatusNotFound, "runtime health not found")
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(data)
|
||||
}
|
||||
|
||||
// ── Install ───────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIInstallDisks(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
disks, err := h.opts.App.ListInstallDisks()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
type diskJSON struct {
|
||||
Device string `json:"device"`
|
||||
Model string `json:"model"`
|
||||
Size string `json:"size"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
MountedParts []string `json:"mounted_parts"`
|
||||
Warnings []string `json:"warnings"`
|
||||
}
|
||||
result := make([]diskJSON, 0, len(disks))
|
||||
for _, d := range disks {
|
||||
result = append(result, diskJSON{
|
||||
Device: d.Device,
|
||||
Model: d.Model,
|
||||
Size: d.Size,
|
||||
SizeBytes: d.SizeBytes,
|
||||
MountedParts: d.MountedParts,
|
||||
Warnings: platform.DiskWarnings(d),
|
||||
})
|
||||
}
|
||||
writeJSON(w, result)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Device == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
|
||||
// Whitelist: only allow devices that ListInstallDisks() returns.
|
||||
disks, err := h.opts.App.ListInstallDisks()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
allowed := false
|
||||
for _, d := range disks {
|
||||
if d.Device == req.Device {
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "device not in install candidate list")
|
||||
return
|
||||
}
|
||||
if globalQueue.hasActiveTarget("install-to-ram") {
|
||||
writeError(w, http.StatusConflict, "install to RAM task is already pending or running")
|
||||
return
|
||||
}
|
||||
|
||||
h.installMu.Lock()
|
||||
if h.installJob != nil && !h.installJob.isDone() {
|
||||
h.installMu.Unlock()
|
||||
writeError(w, http.StatusConflict, "install already running")
|
||||
return
|
||||
}
|
||||
j := &jobState{}
|
||||
h.installJob = j
|
||||
h.installMu.Unlock()
|
||||
|
||||
logFile := platform.InstallLogPath(req.Device)
|
||||
go runCmdJob(j, exec.CommandContext(context.Background(), "bee-install", req.Device, logFile))
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request) {
|
||||
h.installMu.Lock()
|
||||
j := h.installJob
|
||||
h.installMu.Unlock()
|
||||
if j == nil {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
sseWrite(w, "done", "")
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
}
|
||||
|
||||
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte("{}"))
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write(b)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !sseWrite(w, "metrics", string(b)) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// feedRings pushes one sample into all in-memory ring buffers.
|
||||
func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||
for _, t := range sample.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
h.pushNamedMetricRing(&h.cpuTempRings, t.Name, t.Celsius)
|
||||
case "ambient":
|
||||
h.pushNamedMetricRing(&h.ambientTempRings, t.Name, t.Celsius)
|
||||
}
|
||||
}
|
||||
h.ringPower.push(sample.PowerW)
|
||||
h.ringCPULoad.push(sample.CPULoadPct)
|
||||
h.ringMemLoad.push(sample.MemLoadPct)
|
||||
|
||||
h.ringsMu.Lock()
|
||||
for i, fan := range sample.Fans {
|
||||
for len(h.ringFans) <= i {
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
h.fanNames = append(h.fanNames, fan.Name)
|
||||
}
|
||||
h.ringFans[i].push(float64(fan.RPM))
|
||||
}
|
||||
for _, gpu := range sample.GPUs {
|
||||
idx := gpu.GPUIndex
|
||||
for len(h.gpuRings) <= idx {
|
||||
h.gpuRings = append(h.gpuRings, &gpuRings{
|
||||
Temp: newMetricsRing(120),
|
||||
Util: newMetricsRing(120),
|
||||
MemUtil: newMetricsRing(120),
|
||||
Power: newMetricsRing(120),
|
||||
})
|
||||
}
|
||||
h.gpuRings[idx].Temp.push(gpu.TempC)
|
||||
h.gpuRings[idx].Util.push(gpu.UsagePct)
|
||||
h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct)
|
||||
h.gpuRings[idx].Power.push(gpu.PowerW)
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
}
|
||||
|
||||
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||
if name == "" {
|
||||
return
|
||||
}
|
||||
for _, item := range *dst {
|
||||
if item != nil && item.Name == name && item.Ring != nil {
|
||||
item.Ring.push(value)
|
||||
return
|
||||
}
|
||||
}
|
||||
*dst = append(*dst, &namedMetricsRing{
|
||||
Name: name,
|
||||
Ring: newMetricsRing(120),
|
||||
})
|
||||
(*dst)[len(*dst)-1].Ring.push(value)
|
||||
}
|
||||
|
||||
// ── Network toggle ────────────────────────────────────────────────────────────
|
||||
|
||||
const netRollbackTimeout = 60 * time.Second
|
||||
|
||||
func (h *handler) handleAPINetworkToggle(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Iface string `json:"iface"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Iface == "" {
|
||||
writeError(w, http.StatusBadRequest, "iface is required")
|
||||
return
|
||||
}
|
||||
|
||||
wasUp, err := h.opts.App.GetInterfaceState(req.Iface)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := h.applyPendingNetworkChange(func() (app.ActionResult, error) {
|
||||
err := h.opts.App.SetInterfaceState(req.Iface, !wasUp)
|
||||
return app.ActionResult{}, err
|
||||
}); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
newState := "up"
|
||||
if wasUp {
|
||||
newState = "down"
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"iface": req.Iface,
|
||||
"new_state": newState,
|
||||
"rollback_in": int(netRollbackTimeout.Seconds()),
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, error)) (app.ActionResult, error) {
|
||||
if h.opts.App == nil {
|
||||
return app.ActionResult{}, fmt.Errorf("app not configured")
|
||||
}
|
||||
|
||||
if err := h.rollbackPendingNetworkChange(); err != nil && err.Error() != "no pending network change" {
|
||||
return app.ActionResult{}, err
|
||||
}
|
||||
|
||||
snapshot, err := h.opts.App.CaptureNetworkSnapshot()
|
||||
if err != nil {
|
||||
return app.ActionResult{}, err
|
||||
}
|
||||
|
||||
result, err := apply()
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
|
||||
pnc := &pendingNetChange{snapshot: snapshot}
|
||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||
h.pendingNetMu.Lock()
|
||||
if h.pendingNet == pnc {
|
||||
h.pendingNet = nil
|
||||
}
|
||||
h.pendingNetMu.Unlock()
|
||||
})
|
||||
|
||||
h.pendingNetMu.Lock()
|
||||
h.pendingNet = pnc
|
||||
h.pendingNetMu.Unlock()
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
h.pendingNet = nil
|
||||
h.pendingNetMu.Unlock()
|
||||
if pnc != nil {
|
||||
pnc.mu.Lock()
|
||||
pnc.timer.Stop()
|
||||
pnc.mu.Unlock()
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "confirmed"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkRollback(w http.ResponseWriter, _ *http.Request) {
|
||||
if err := h.rollbackPendingNetworkChange(); err != nil {
|
||||
if err.Error() == "no pending network change" {
|
||||
writeError(w, http.StatusConflict, err.Error())
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rolled back"})
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
h.pendingNet = nil
|
||||
h.pendingNetMu.Unlock()
|
||||
if pnc == nil {
|
||||
return fmt.Errorf("no pending network change")
|
||||
}
|
||||
pnc.mu.Lock()
|
||||
pnc.timer.Stop()
|
||||
pnc.mu.Unlock()
|
||||
if h.opts.App != nil {
|
||||
return h.opts.App.RestoreNetworkSnapshot(pnc.snapshot)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
||||
|
||||
type displayMode struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
Current bool `json:"current"`
|
||||
}
|
||||
|
||||
type displayInfo struct {
|
||||
Output string `json:"output"`
|
||||
Modes []displayMode `json:"modes"`
|
||||
Current string `json:"current"`
|
||||
}
|
||||
|
||||
var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
|
||||
var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
|
||||
var xrandrCurrentRE = regexp.MustCompile(`\*`)
|
||||
|
||||
func parseXrandrOutput(out string) []displayInfo {
|
||||
var infos []displayInfo
|
||||
var cur *displayInfo
|
||||
for _, line := range strings.Split(out, "\n") {
|
||||
if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
cur = &displayInfo{Output: m[1]}
|
||||
continue
|
||||
}
|
||||
if cur == nil {
|
||||
continue
|
||||
}
|
||||
if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
|
||||
isCurrent := xrandrCurrentRE.MatchString(line)
|
||||
mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
|
||||
cur.Modes = append(cur.Modes, mode)
|
||||
if isCurrent {
|
||||
cur.Current = m[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
return infos
|
||||
}
|
||||
|
||||
func xrandrCommand(args ...string) *exec.Cmd {
|
||||
cmd := exec.Command("xrandr", args...)
|
||||
env := append([]string{}, os.Environ()...)
|
||||
hasDisplay := false
|
||||
hasXAuthority := false
|
||||
for _, kv := range env {
|
||||
if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
env = append(env, "DISPLAY=:0")
|
||||
}
|
||||
if !hasXAuthority {
|
||||
env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
|
||||
}
|
||||
cmd.Env = env
|
||||
return cmd
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := xrandrCommand().Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, parseXrandrOutput(string(out)))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
||||
return
|
||||
}
|
||||
// Validate mode looks like WxH to prevent injection
|
||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
||||
return
|
||||
}
|
||||
// Validate output name (no special chars)
|
||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
||||
}
|
||||
64
audit/internal/webui/api_test.go
Normal file
64
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
}
|
||||
137
audit/internal/webui/jobs.go
Normal file
137
audit/internal/webui/jobs.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
|
||||
type jobState struct {
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
func (j *jobState) abort() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
if j.done || j.cancel == nil {
|
||||
return false
|
||||
}
|
||||
j.cancel()
|
||||
return true
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
case ch <- line:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (j *jobState) finish(errMsg string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.done = true
|
||||
j.err = errMsg
|
||||
for _, ch := range j.subs {
|
||||
close(ch)
|
||||
}
|
||||
j.subs = nil
|
||||
}
|
||||
|
||||
// subscribe returns a channel that receives all future lines.
|
||||
// Existing lines are returned first, then the channel streams new ones.
|
||||
func (j *jobState) subscribe() ([]string, <-chan string) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
existing := make([]string, len(j.lines))
|
||||
copy(existing, j.lines)
|
||||
if j.done {
|
||||
return existing, nil
|
||||
}
|
||||
ch := make(chan string, 256)
|
||||
j.subs = append(j.subs, ch)
|
||||
return existing, ch
|
||||
}
|
||||
|
||||
// jobManager manages async jobs identified by string IDs.
|
||||
type jobManager struct {
|
||||
mu sync.Mutex
|
||||
jobs map[string]*jobState
|
||||
}
|
||||
|
||||
var globalJobs = &jobManager{jobs: make(map[string]*jobState)}
|
||||
|
||||
func (m *jobManager) create(id string) *jobState {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j := &jobState{}
|
||||
m.jobs[id] = j
|
||||
// Schedule cleanup after 30 minutes
|
||||
go func() {
|
||||
time.Sleep(30 * time.Minute)
|
||||
m.mu.Lock()
|
||||
delete(m.jobs, id)
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
return j
|
||||
}
|
||||
|
||||
// isDone returns true if the job has finished (either successfully or with error).
|
||||
func (j *jobState) isDone() bool {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
return j.done
|
||||
}
|
||||
|
||||
func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
j, ok := m.jobs[id]
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
data, err := os.ReadFile(logPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return j
|
||||
}
|
||||
lines := strings.Split(strings.ReplaceAll(string(data), "\r\n", "\n"), "\n")
|
||||
if len(lines) > 0 && lines[len(lines)-1] == "" {
|
||||
lines = lines[:len(lines)-1]
|
||||
}
|
||||
j.lines = append(j.lines, lines...)
|
||||
return j
|
||||
}
|
||||
|
||||
func appendJobLog(path, line string) {
|
||||
if path == "" {
|
||||
return
|
||||
}
|
||||
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
_, _ = f.WriteString(line + "\n")
|
||||
}
|
||||
317
audit/internal/webui/metricsdb.go
Normal file
317
audit/internal/webui/metricsdb.go
Normal file
@@ -0,0 +1,317 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||
|
||||
// MetricsDB persists live metric samples to SQLite.
|
||||
type MetricsDB struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db.SetMaxOpenConns(1)
|
||||
if err := initMetricsSchema(db); err != nil {
|
||||
_ = db.Close()
|
||||
return nil, err
|
||||
}
|
||||
return &MetricsDB{db: db}, nil
|
||||
}
|
||||
|
||||
func initMetricsSchema(db *sql.DB) error {
|
||||
_, err := db.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS sys_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
gpu_index INTEGER NOT NULL,
|
||||
temp_c REAL,
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
rpm REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
grp TEXT NOT NULL,
|
||||
celsius REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
return err
|
||||
}
|
||||
|
||||
// Write inserts one sample into all relevant tables.
|
||||
func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
ts := s.Timestamp.Unix()
|
||||
tx, err := m.db.Begin()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, f := range s.Fans {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO fan_metrics(ts,name,rpm) VALUES(?,?,?)`,
|
||||
ts, f.Name, f.RPM,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO temp_metrics(ts,name,grp,celsius) VALUES(?,?,?,?)`,
|
||||
ts, t.Name, t.Group, t.Celsius,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
}
|
||||
var sysRows []sysRow
|
||||
for rows.Next() {
|
||||
var r sysRow
|
||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
||||
continue
|
||||
}
|
||||
sysRows = append(sysRows, r)
|
||||
}
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Reverse to chronological order
|
||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||
}
|
||||
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
|
||||
// Load GPU rows in range
|
||||
type gpuKey struct{ ts int64; idx int }
|
||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||
gRows, err := m.db.Query(
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer gRows.Close()
|
||||
for gRows.Next() {
|
||||
var ts int64
|
||||
var g platform.GPUMetricRow
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load fan rows in range
|
||||
type fanKey struct{ ts int64; name string }
|
||||
fanData := map[fanKey]float64{}
|
||||
fRows, err := m.db.Query(
|
||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer fRows.Close()
|
||||
for fRows.Next() {
|
||||
var ts int64
|
||||
var name string
|
||||
var rpm float64
|
||||
if err := fRows.Scan(&ts, &name, &rpm); err == nil {
|
||||
fanData[fanKey{ts, name}] = rpm
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load temp rows in range
|
||||
type tempKey struct{ ts int64; name string }
|
||||
tempData := map[tempKey]platform.TempReading{}
|
||||
tRows, err := m.db.Query(
|
||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
defer tRows.Close()
|
||||
for tRows.Next() {
|
||||
var ts int64
|
||||
var t platform.TempReading
|
||||
if err := tRows.Scan(&ts, &t.Name, &t.Group, &t.Celsius); err == nil {
|
||||
tempData[tempKey{ts, t.Name}] = t
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
if !seenGPU[k.idx] {
|
||||
seenGPU[k.idx] = true
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
if !seenFan[k.name] {
|
||||
seenFan[k.name] = true
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
if !seenTemp[k.name] {
|
||||
seenTemp[k.name] = true
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
s := platform.LiveMetricSample{
|
||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||
CPULoadPct: r.cpu,
|
||||
MemLoadPct: r.mem,
|
||||
PowerW: r.pwr,
|
||||
}
|
||||
for _, idx := range gpuIndices {
|
||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||
s.GPUs = append(s.GPUs, g)
|
||||
}
|
||||
}
|
||||
for _, name := range fanNames {
|
||||
if rpm, ok := fanData[fanKey{r.ts, name}]; ok {
|
||||
s.Fans = append(s.Fans, platform.FanReading{Name: name, RPM: rpm})
|
||||
}
|
||||
}
|
||||
for _, name := range tempNames {
|
||||
if t, ok := tempData[tempKey{r.ts, name}]; ok {
|
||||
s.Temps = append(s.Temps, t)
|
||||
}
|
||||
}
|
||||
samples[i] = s
|
||||
}
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
||||
FROM sys_metrics s
|
||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||
ORDER BY s.ts, g.gpu_index
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
cw := csv.NewWriter(w)
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
||||
for rows.Next() {
|
||||
var ts int64
|
||||
var cpu, mem, pwr float64
|
||||
var gpuIdx sql.NullInt64
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
||||
continue
|
||||
}
|
||||
row := []string{
|
||||
strconv.FormatInt(ts, 10),
|
||||
strconv.FormatFloat(cpu, 'f', 2, 64),
|
||||
strconv.FormatFloat(mem, 'f', 2, 64),
|
||||
strconv.FormatFloat(pwr, 'f', 1, 64),
|
||||
}
|
||||
if gpuIdx.Valid {
|
||||
row = append(row,
|
||||
strconv.FormatInt(gpuIdx.Int64, 10),
|
||||
strconv.FormatFloat(gpuTemp.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||
)
|
||||
} else {
|
||||
row = append(row, "", "", "", "", "")
|
||||
}
|
||||
_ = cw.Write(row)
|
||||
}
|
||||
cw.Flush()
|
||||
return cw.Error()
|
||||
}
|
||||
|
||||
// Close closes the database.
|
||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||
|
||||
func nullFloat(v float64) sql.NullFloat64 {
|
||||
return sql.NullFloat64{Float64: v, Valid: true}
|
||||
}
|
||||
1462
audit/internal/webui/pages.go
Normal file
1462
audit/internal/webui/pages.go
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -7,11 +7,95 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestRootRendersLatestSnapshot(t *testing.T) {
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 0.4, want: "0"},
|
||||
{in: 61.5, want: "62"},
|
||||
{in: 999.4, want: "999"},
|
||||
{in: 1200, want: "1,2k"},
|
||||
{in: 1250, want: "1,25k"},
|
||||
{in: 1310, want: "1,31k"},
|
||||
{in: 1500, want: "1,5k"},
|
||||
{in: 2600, want: "2,6k"},
|
||||
{in: 10200, want: "10k"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartLegendNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartLegendNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||
CPULoadPct: 10,
|
||||
MemLoadPct: 20,
|
||||
PowerW: 300,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
CPULoadPct: 30,
|
||||
MemLoadPct: 40,
|
||||
PowerW: 320,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
CPULoadPct: 50,
|
||||
MemLoadPct: 60,
|
||||
PowerW: 340,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 1 || names[0] != "GPU 0" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != len(samples) {
|
||||
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||
}
|
||||
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||
t.Fatalf("datasets shape=%v", datasets)
|
||||
}
|
||||
if got := datasets[0][0]; got != 120 {
|
||||
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||
}
|
||||
if got := datasets[0][2]; got != 130 {
|
||||
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-OLD"}}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -19,6 +103,7 @@ func TestRootRendersLatestSnapshot(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
first := httptest.NewRecorder()
|
||||
@@ -26,8 +111,12 @@ func TestRootRendersLatestSnapshot(t *testing.T) {
|
||||
if first.Code != http.StatusOK {
|
||||
t.Fatalf("first status=%d", first.Code)
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), "SERIAL-OLD") {
|
||||
t.Fatalf("first body missing old serial: %s", first.Body.String())
|
||||
// Dashboard should contain the audit nav link and hardware summary
|
||||
if !strings.Contains(first.Body.String(), `href="/audit"`) {
|
||||
t.Fatalf("first body missing audit nav link: %s", first.Body.String())
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||
}
|
||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||
t.Fatalf("first cache-control=%q", got)
|
||||
@@ -42,11 +131,91 @@ func TestRootRendersLatestSnapshot(t *testing.T) {
|
||||
if second.Code != http.StatusOK {
|
||||
t.Fatalf("second status=%d", second.Code)
|
||||
}
|
||||
if !strings.Contains(second.Body.String(), `Hardware Summary`) {
|
||||
t.Fatalf("second body missing hardware summary: %s", second.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `iframe class="viewer-frame" src="/viewer"`) {
|
||||
t.Fatalf("audit page missing viewer frame: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `openAuditModal()`) {
|
||||
t.Fatalf("audit page missing action modal trigger: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-OLD"}}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
first := httptest.NewRecorder()
|
||||
handler.ServeHTTP(first, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||
if first.Code != http.StatusOK {
|
||||
t.Fatalf("first status=%d", first.Code)
|
||||
}
|
||||
if !strings.Contains(first.Body.String(), "SERIAL-OLD") {
|
||||
t.Fatalf("viewer body missing old serial: %s", first.Body.String())
|
||||
}
|
||||
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:05:00Z","hardware":{"board":{"serial_number":"SERIAL-NEW"}}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
second := httptest.NewRecorder()
|
||||
handler.ServeHTTP(second, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||
if second.Code != http.StatusOK {
|
||||
t.Fatalf("second status=%d", second.Code)
|
||||
}
|
||||
if !strings.Contains(second.Body.String(), "SERIAL-NEW") {
|
||||
t.Fatalf("second body missing new serial: %s", second.Body.String())
|
||||
t.Fatalf("viewer body missing new serial: %s", second.Body.String())
|
||||
}
|
||||
if strings.Contains(second.Body.String(), "SERIAL-OLD") {
|
||||
t.Fatalf("second body still contains old serial: %s", second.Body.String())
|
||||
t.Fatalf("viewer body still contains old serial: %s", second.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,8 +233,8 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if got := strings.TrimSpace(rec.Body.String()); got != body {
|
||||
t.Fatalf("body=%q want %q", got, body)
|
||||
if !strings.Contains(rec.Body.String(), "SERIAL-API") {
|
||||
t.Fatalf("body missing expected serial: %s", rec.Body.String())
|
||||
}
|
||||
if got := rec.Header().Get("Content-Type"); !strings.Contains(got, "application/json") {
|
||||
t.Fatalf("content-type=%q", got)
|
||||
@@ -80,3 +249,49 @@ func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
||||
t.Fatalf("status=%d want %d", rec.Code, http.StatusNotFound)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/export/support.tar.gz", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if got := rec.Header().Get("Content-Disposition"); !strings.Contains(got, "attachment;") {
|
||||
t.Fatalf("content-disposition=%q", got)
|
||||
}
|
||||
if rec.Body.Len() == 0 {
|
||||
t.Fatal("empty archive body")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
body := `{"status":"PARTIAL","checked_at":"2026-03-16T10:00:00Z"}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/runtime-health.json", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if strings.TrimSpace(rec.Body.String()) != body {
|
||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||
}
|
||||
}
|
||||
|
||||
735
audit/internal/webui/tasks.go
Normal file
735
audit/internal/webui/tasks.go
Normal file
@@ -0,0 +1,735 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Task statuses.
|
||||
const (
|
||||
TaskPending = "pending"
|
||||
TaskRunning = "running"
|
||||
TaskDone = "done"
|
||||
TaskFailed = "failed"
|
||||
TaskCancelled = "cancelled"
|
||||
)
|
||||
|
||||
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
var burnNames = map[string]string{
|
||||
"nvidia": "NVIDIA Burn-in",
|
||||
"memory": "Memory Burn-in",
|
||||
"cpu": "CPU Burn-in",
|
||||
"amd": "AMD GPU Burn-in",
|
||||
}
|
||||
|
||||
func nvidiaStressTaskName(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case platform.NvidiaStressLoaderJohn:
|
||||
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||
case platform.NvidiaStressLoaderNCCL:
|
||||
return "NVIDIA GPU Stress (NCCL)"
|
||||
default:
|
||||
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||
}
|
||||
}
|
||||
|
||||
func taskDisplayName(target, profile, loader string) string {
|
||||
name := taskNames[target]
|
||||
if profile != "" {
|
||||
if n, ok := burnNames[target]; ok {
|
||||
name = n
|
||||
}
|
||||
}
|
||||
if target == "nvidia-stress" {
|
||||
name = nvidiaStressTaskName(loader)
|
||||
}
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
Params taskParams `json:"params,omitempty"`
|
||||
}
|
||||
|
||||
type burnPreset struct {
|
||||
NvidiaDiag int
|
||||
DurationSec int
|
||||
}
|
||||
|
||||
func resolveBurnPreset(profile string) burnPreset {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
|
||||
case "acceptance":
|
||||
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
|
||||
default:
|
||||
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
|
||||
}
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
}}
|
||||
case "acceptance":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
}}
|
||||
default: // smoke
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 90, IdleSec: 60},
|
||||
{LoadSec: 90, IdleSec: 30},
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
|
||||
const maxTaskHistory = 50
|
||||
|
||||
var (
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runMemoryStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
func (q *taskQueue) enqueue(t *Task) {
|
||||
q.mu.Lock()
|
||||
q.assignTaskLogPathLocked(t)
|
||||
q.tasks = append(q.tasks, t)
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// prune removes oldest completed tasks beyond maxTaskHistory.
|
||||
func (q *taskQueue) prune() {
|
||||
var done []*Task
|
||||
var active []*Task
|
||||
for _, t := range q.tasks {
|
||||
switch t.Status {
|
||||
case TaskDone, TaskFailed, TaskCancelled:
|
||||
done = append(done, t)
|
||||
default:
|
||||
active = append(active, t)
|
||||
}
|
||||
}
|
||||
if len(done) > maxTaskHistory {
|
||||
done = done[len(done)-maxTaskHistory:]
|
||||
}
|
||||
q.tasks = append(active, done...)
|
||||
}
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
continue
|
||||
}
|
||||
if best == nil || t.Priority > best.Priority ||
|
||||
(t.Priority == best.Priority && t.CreatedAt.Before(best.CreatedAt)) {
|
||||
best = t
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// findByID looks up a task by ID.
|
||||
func (q *taskQueue) findByID(id string) (*Task, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID == id {
|
||||
return t, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// findJob returns the jobState for a task ID (for SSE streaming compatibility).
|
||||
func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
t, ok := q.findByID(id)
|
||||
if !ok || t.job == nil {
|
||||
return nil, false
|
||||
}
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.Target != target {
|
||||
continue
|
||||
}
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
out := make([]Task, len(q.tasks))
|
||||
for i, t := range q.tasks {
|
||||
out[i] = *t
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
return si < sj
|
||||
}
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func statusOrder(s string) int {
|
||||
switch s {
|
||||
case TaskRunning:
|
||||
return 0
|
||||
case TaskPending:
|
||||
return 1
|
||||
default:
|
||||
return 2
|
||||
}
|
||||
}
|
||||
|
||||
// startWorker launches the queue runner goroutine.
|
||||
func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
q.mu.Lock()
|
||||
q.opts = opts
|
||||
q.statePath = filepath.Join(opts.ExportDir, "tasks-state.json")
|
||||
q.logsDir = filepath.Join(opts.ExportDir, "tasks")
|
||||
_ = os.MkdirAll(q.logsDir, 0755)
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
go q.worker()
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
q.mu.Unlock()
|
||||
if hasPending {
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) worker() {
|
||||
for {
|
||||
<-q.trigger
|
||||
setCPUGovernor("performance")
|
||||
for {
|
||||
q.mu.Lock()
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
q.mu.Unlock()
|
||||
break
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
t.job = j
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
if t.Status == TaskRunning { // not cancelled externally
|
||||
if j.err != "" {
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = j.err
|
||||
} else {
|
||||
t.Status = TaskDone
|
||||
}
|
||||
}
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
}
|
||||
setCPUGovernor("powersave")
|
||||
}
|
||||
}
|
||||
|
||||
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||
// Silently ignores errors (e.g. when cpufreq is not available).
|
||||
func setCPUGovernor(governor string) {
|
||||
matches, err := filepath.Glob("/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor")
|
||||
if err != nil || len(matches) == 0 {
|
||||
return
|
||||
}
|
||||
for _, path := range matches {
|
||||
_ = os.WriteFile(path, []byte(governor), 0644)
|
||||
}
|
||||
}
|
||||
|
||||
// runTask executes the work for a task, writing output to j.
|
||||
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
j.append("ERROR: app not configured")
|
||||
j.finish("app not configured")
|
||||
return
|
||||
}
|
||||
a := q.opts.App
|
||||
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if len(j.lines) > 0 {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
diagLevel := t.params.DiagLevel
|
||||
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||
)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "storage":
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "cpu":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "memory-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "sat-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
for _, line := range splitLines(result.Body) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "install-to-ram":
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
return
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
if archive != "" {
|
||||
j.append("Archive: " + archive)
|
||||
}
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
if l != "" {
|
||||
out = append(out, l)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func splitNL(s string) []string {
|
||||
var out []string
|
||||
start := 0
|
||||
for i, c := range s {
|
||||
if c == '\n' {
|
||||
out = append(out, s[start:i])
|
||||
start = i + 1
|
||||
}
|
||||
}
|
||||
out = append(out, s[start:])
|
||||
return out
|
||||
}
|
||||
|
||||
// ── HTTP handlers ─────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPITasksList(w http.ResponseWriter, _ *http.Request) {
|
||||
tasks := globalQueue.snapshot()
|
||||
writeJSON(w, tasks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
t, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
writeError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksPriority(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
t, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
writeError(w, http.StatusNotFound, "task not found")
|
||||
return
|
||||
}
|
||||
var req struct {
|
||||
Delta int `json:"delta"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid body")
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if t.Status != TaskPending {
|
||||
writeError(w, http.StatusConflict, "only pending tasks can be reprioritised")
|
||||
return
|
||||
}
|
||||
t.Priority += req.Delta
|
||||
globalQueue.persistLocked()
|
||||
writeJSON(w, map[string]int{"priority": t.Priority})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request) {
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
n := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
n++
|
||||
}
|
||||
}
|
||||
globalQueue.persistLocked()
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var j *jobState
|
||||
for time.Now().Before(deadline) {
|
||||
if jj, ok := globalQueue.findJob(id); ok {
|
||||
j = jj
|
||||
break
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if j == nil {
|
||||
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
||||
return
|
||||
}
|
||||
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
||||
}
|
||||
|
||||
func (q *taskQueue) loadLocked() {
|
||||
if q.statePath == "" {
|
||||
return
|
||||
}
|
||||
data, err := os.ReadFile(q.statePath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return
|
||||
}
|
||||
var persisted []persistedTask
|
||||
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||
return
|
||||
}
|
||||
for _, pt := range persisted {
|
||||
t := &Task{
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
params: pt.Params,
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
t.Status = TaskPending
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
q.tasks = append(q.tasks, t)
|
||||
}
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
}
|
||||
|
||||
func (q *taskQueue) persistLocked() {
|
||||
if q.statePath == "" {
|
||||
return
|
||||
}
|
||||
state := make([]persistedTask, 0, len(q.tasks))
|
||||
for _, t := range q.tasks {
|
||||
state = append(state, persistedTask{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Priority: t.Priority,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
ErrMsg: t.ErrMsg,
|
||||
LogPath: t.LogPath,
|
||||
Params: t.params,
|
||||
})
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tmp := q.statePath + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return
|
||||
}
|
||||
_ = os.Rename(tmp, q.statePath)
|
||||
}
|
||||
204
audit/internal/webui/tasks_test.go
Normal file
204
audit/internal/webui/tasks_test.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
)
|
||||
|
||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{
|
||||
Duration: 300,
|
||||
BurnProfile: "smoke",
|
||||
},
|
||||
}
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
statePath: q.statePath,
|
||||
logsDir: q.logsDir,
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||
}
|
||||
got := recovered.tasks[0]
|
||||
if got.Status != TaskPending {
|
||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||
}
|
||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("params=%+v", got.params)
|
||||
}
|
||||
if got.LogPath == "" {
|
||||
t.Fatal("expected log path")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(path, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
j := newTaskJobState(path)
|
||||
existing, ch := j.subscribe()
|
||||
if ch == nil {
|
||||
t.Fatal("expected live subscription channel")
|
||||
}
|
||||
if len(existing) != 2 || existing[0] != "line1" || existing[1] != "line2" {
|
||||
t.Fatalf("existing=%v", existing)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
want burnPreset
|
||||
}{
|
||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||
t.Fatalf("resolveBurnPreset(%q)=%+v want %+v", tc.profile, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
blocked := make(chan struct{})
|
||||
released := make(chan struct{})
|
||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
close(blocked)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
close(released)
|
||||
return "", ctx.Err()
|
||||
case <-time.After(5 * time.Second):
|
||||
close(released)
|
||||
return "unexpected", nil
|
||||
}
|
||||
}
|
||||
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Duration: 60},
|
||||
}
|
||||
j := &jobState{}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
j.cancel = cancel
|
||||
tk.job = j
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return aRun(nil, ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
q.runTask(tk, j, ctx)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
<-blocked
|
||||
j.abort()
|
||||
|
||||
select {
|
||||
case <-released:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("task did not observe cancel")
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("runTask did not return after cancel")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-burn-1",
|
||||
Name: "CPU Burn-in",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{BurnProfile: "smoke"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||
gotDuration = durationSec
|
||||
return "/tmp/cpu-burn.tar.gz", nil
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDuration != 5*60 {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
2
bible
2
bible
Submodule bible updated: 456c1f022c...688b87e98d
38
bible-local/architecture/charting.md
Normal file
38
bible-local/architecture/charting.md
Normal file
@@ -0,0 +1,38 @@
|
||||
# Charting architecture
|
||||
|
||||
## Decision: one chart engine for all live metrics
|
||||
|
||||
**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output)
|
||||
**Theme:** `grafana` (dark background, coloured lines)
|
||||
|
||||
All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
- SVG output — crisp at any display resolution, full-width without pixelation
|
||||
- Grafana theme matches the dark web UI colour scheme
|
||||
- Active fork of the archived wcharczuk/go-chart
|
||||
|
||||
### SAT stress-test charts
|
||||
|
||||
The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate
|
||||
self-contained SVG renderer used **only** for completed SAT run reports
|
||||
(HTML export, burn-in summaries). It is not used for live metrics.
|
||||
|
||||
### Live metrics chart endpoints
|
||||
|
||||
| Path | Content |
|
||||
|------|---------|
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz).
|
||||
Buffers are per-server or per-GPU and grow dynamically as new GPUs appear.
|
||||
@@ -9,6 +9,8 @@ DHCP is used only for LAN (operator SSH access). Internet is NOT available.
|
||||
|
||||
## Boot sequence (single ISO)
|
||||
|
||||
The live system is expected to boot with `toram`, so `live-boot` copies the full read-only medium into RAM before mounting the root filesystem. After that point, runtime must not depend on the original USB/BMC virtual media staying readable.
|
||||
|
||||
`systemd` boot order:
|
||||
|
||||
```
|
||||
@@ -20,11 +22,12 @@ local-fs.target
|
||||
│ creates /dev/nvidia* nodes)
|
||||
├── bee-audit.service (runs `bee audit` → /var/log/bee-audit.json,
|
||||
│ never blocks boot on partial collector failures)
|
||||
└── bee-web.service (runs `bee web` on :80,
|
||||
reads the latest audit snapshot on each request)
|
||||
├── bee-web.service (runs `bee web` on :80 — full interactive web UI)
|
||||
└── bee-desktop.service (startx → openbox + chromium http://localhost/)
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- The live ISO boots with `boot=live toram`. Runtime binaries must continue working even if the original boot media disappears after early boot.
|
||||
- OpenSSH MUST start without network. `bee-sshsetup.service` runs before `ssh.service`.
|
||||
- `bee-network.service` uses `dhclient -nw` (background) — network bring-up is best effort and non-blocking.
|
||||
- `bee-nvidia.service` loads modules via `insmod` with absolute paths — NOT `modprobe`.
|
||||
@@ -41,23 +44,27 @@ Local-console behavior:
|
||||
```text
|
||||
tty1
|
||||
└── live-config autologin → bee
|
||||
└── /home/bee/.profile
|
||||
└── exec menu
|
||||
└── /usr/local/bin/bee-tui
|
||||
└── sudo -n /usr/local/bin/bee tui --runtime livecd
|
||||
└── /home/bee/.profile (prints web UI URLs)
|
||||
|
||||
display :0
|
||||
└── bee-desktop.service (User=bee)
|
||||
└── startx /usr/local/bin/bee-openbox-session -- :0
|
||||
├── tint2 (taskbar)
|
||||
├── chromium http://localhost/
|
||||
└── openbox (WM)
|
||||
```
|
||||
|
||||
Rules:
|
||||
- local `tty1` lands in user `bee`, not directly in `root`
|
||||
- `menu` must work without typing `sudo`
|
||||
- TUI actions still run as `root` via `sudo -n`
|
||||
- SSH is independent from the tty1 path
|
||||
- `bee-desktop.service` starts X11 + openbox + Chromium automatically after `bee-web.service`
|
||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||
- SSH is independent from the desktop path
|
||||
- serial console support is enabled for VM boot debugging
|
||||
|
||||
## ISO build sequence
|
||||
|
||||
```
|
||||
build.sh [--authorized-keys /path/to/keys]
|
||||
build-in-container.sh [--authorized-keys /path/to/keys]
|
||||
1. compile `bee` binary (skip if .go files older than binary)
|
||||
2. create a temporary overlay staging dir under `dist/`
|
||||
3. inject authorized_keys into staged `root/.ssh/` (or set password fallback marker)
|
||||
@@ -71,25 +78,39 @@ build.sh [--authorized-keys /path/to/keys]
|
||||
d. build kernel modules against Debian headers
|
||||
e. create `libnvidia-ml.so.1` / `libcuda.so.1` symlinks in cache
|
||||
f. cache in `dist/nvidia-<version>-<kver>/`
|
||||
7. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
8. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
9. inject `libnvidia-ml` + `libcuda` → staged `/usr/lib/`
|
||||
10. write staged `/etc/bee-release` (versions + git commit)
|
||||
11. patch staged `motd` with build metadata
|
||||
12. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
13. sync staged overlay into workdir `config/includes.chroot/`
|
||||
14. run `lb config && lb build` inside the temporary workdir
|
||||
(either on a Debian host/VM or inside the privileged builder container)
|
||||
7. `build-cublas.sh`:
|
||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||
b. verify packages against repo `Packages.gz`
|
||||
c. extract headers for `bee-gpu-burn` worker build
|
||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||
12. write staged `/etc/bee-release` (versions + git commit)
|
||||
13. patch staged `motd` with build metadata
|
||||
14. copy `iso/builder/` into a temporary live-build workdir under `dist/`
|
||||
15. sync staged overlay into workdir `config/includes.chroot/`
|
||||
16. run `lb config && lb build` inside the privileged builder container
|
||||
```
|
||||
|
||||
Build host notes:
|
||||
- `build-in-container.sh` targets `linux/amd64` builder containers by default, including Docker Desktop on macOS / Apple Silicon.
|
||||
- Override with `BEE_BUILDER_PLATFORM=<os/arch>` only if you intentionally need a different container platform.
|
||||
- If the local builder image under the same tag was previously built for the wrong architecture, the script rebuilds it automatically.
|
||||
|
||||
**Critical invariants:**
|
||||
- `DEBIAN_KERNEL_ABI` in `iso/builder/VERSIONS` pins the exact kernel ABI used in BOTH places:
|
||||
1. `setup-builder.sh` / `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||
- Container build requires `--privileged` because `live-build` uses mounts/chroots/loop devices during ISO assembly.
|
||||
- On macOS / Docker Desktop, the builder still must run as `linux/amd64` so the shipped ISO binaries remain `amd64`.
|
||||
- Operators must provision enough RAM to hold the full compressed live medium plus normal runtime overhead, because `toram` copies the entire read-only ISO payload into memory before the system reaches steady state.
|
||||
|
||||
## Post-boot smoke test
|
||||
|
||||
@@ -105,7 +126,7 @@ Key checks: NVIDIA modules loaded, `nvidia-smi` sees all GPUs, lib symlinks pres
|
||||
systemd services running, audit completed with NVIDIA enrichment, LAN reachability.
|
||||
|
||||
Current validation state:
|
||||
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and TUI startup
|
||||
- local/libvirt VM boot path is validated for `systemd`, SSH, `bee audit`, `bee-network`, and Web UI startup
|
||||
- real hardware validation is still required before treating the ISO as release-ready
|
||||
|
||||
## Overlay mechanism
|
||||
@@ -132,12 +153,31 @@ Current validation state:
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-stress`
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||
- Ada / Hopper: add `fp8`
|
||||
- Blackwell+: add `fp4`
|
||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
- `BEE_MEMTESTER_SIZE_MB`
|
||||
- `BEE_MEMTESTER_PASSES`
|
||||
|
||||
## NVIDIA SAT Web UI flow
|
||||
|
||||
```
|
||||
Web UI: Acceptance Tests page → Run Test button
|
||||
1. POST /api/sat/nvidia/run → returns job_id
|
||||
2. GET /api/sat/stream?job_id=... (SSE) — streams stdout/stderr lines live
|
||||
3. After completion — archive written to /appdata/bee/export/bee-sat/
|
||||
summary.txt contains overall_status (OK / FAILED) and per-job status values
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||
|
||||
@@ -21,13 +21,14 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||
- Machine-readable health summary derived from collector verdicts
|
||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||
- NVIDIA SAT includes both diagnostic collection and lightweight GPU stress via `bee-gpu-stress`
|
||||
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- Automatic boot audit with operator-facing local console and SSH access
|
||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||
- SSH access (OpenSSH) always available for inspection and debugging
|
||||
- Interactive Go TUI via `bee tui` for network setup, service management, and acceptance tests
|
||||
- Read-only web viewer via `bee web`, rendering the latest audit snapshot through the embedded Reanimator Chart
|
||||
- Local `tty1` operator UX: `bee` autologin, `menu` auto-start, privileged actions via `sudo -n`
|
||||
- Full web UI via `bee web` on port 80: interactive control panel with live metrics, SAT tests, network config, service management, export, and tools
|
||||
- Local operator desktop: openbox + Xorg + Chromium auto-opening `http://localhost/`
|
||||
- Local `tty1` operator UX: `bee` autologin, openbox desktop auto-starts with Chromium on `http://localhost/`
|
||||
|
||||
## Network isolation — CRITICAL
|
||||
|
||||
@@ -69,15 +70,18 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| SSH | OpenSSH server |
|
||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||
|
||||
## Operator UX
|
||||
|
||||
- On the live ISO, `tty1` autologins as `bee`
|
||||
- The login profile auto-runs `menu`, which enters the Go TUI
|
||||
- The TUI itself executes privileged actions as `root` via `sudo -n`
|
||||
- `bee-desktop.service` starts X11 + openbox + Chromium on display `:0`
|
||||
- Chromium opens `http://localhost/` — the full web UI
|
||||
- SSH remains available independently of the local console path
|
||||
- Remote operators can open `http://<ip>/` in any browser on the same LAN
|
||||
- VM-oriented builds also include `qemu-guest-agent` and serial console support for debugging
|
||||
- The ISO boots with `toram`, so loss of the original USB/BMC virtual media after boot should not break already-installed runtime binaries
|
||||
|
||||
## Runtime split
|
||||
|
||||
@@ -85,6 +89,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Live-ISO-only responsibilities stay in `iso/` integration code
|
||||
- Live ISO launches the Go CLI with `--runtime livecd`
|
||||
- Local/manual runs use `--runtime auto` or `--runtime local`
|
||||
- Live ISO targets must have enough RAM for the full compressed live medium plus runtime working set because the boot medium is copied into memory at startup
|
||||
|
||||
## Key paths
|
||||
|
||||
@@ -99,7 +104,10 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | `menu` helper + tty1 auto-start policy |
|
||||
| `iso/overlay/home/bee/.profile` | `bee` shell profile for local console startup |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||
| `iso/overlay/home/bee/.profile` | `bee` shell profile (PATH only) |
|
||||
| `iso/overlay/etc/systemd/system/bee-desktop.service` | starts X11 + openbox + chromium |
|
||||
| `iso/overlay/usr/local/bin/bee-desktop` | startx wrapper for bee-desktop.service |
|
||||
| `iso/overlay/usr/local/bin/bee-openbox-session` | xinitrc: tint2 + chromium + openbox |
|
||||
| `dist/` | Build outputs (gitignored) |
|
||||
| `iso/out/` | Downloaded ISO files (gitignored) |
|
||||
|
||||
@@ -1,5 +1,26 @@
|
||||
# Backlog
|
||||
|
||||
## BMC версия через IPMI
|
||||
|
||||
**Статус:** реализовано.
|
||||
|
||||
Добавить сбор версии BMC firmware в board collector:
|
||||
- Команда: `ipmitool mc info` → поле `Firmware Revision`
|
||||
- Записывать в `hardware.firmware[]` как `{device_name: "BMC", version: "..."}`
|
||||
- Показывать в TUI правой колонке рядом с BIOS версией
|
||||
- Graceful skip если `/dev/ipmi0` отсутствует (silent: same pattern as PSU collector)
|
||||
|
||||
## CPU acceptance test через stress-ng
|
||||
|
||||
**Статус:** реализовано. CPU в Health Check получает PASS/FAIL из summary.txt.
|
||||
|
||||
Добавить CPU SAT на базе `stress-ng`:
|
||||
- Bake `stress-ng` в ISO (добавить в `bee.list.chroot`)
|
||||
- Новый `bee sat cpu` — запускает `stress-ng --cpu 0 --cpu-method all --timeout <N>` где N = duration из режима (Quick=60s, Standard=300s, Express=900s)
|
||||
- Параллельно снимает температуры через `sensors` и throttle-флаги из аудит JSON
|
||||
- Результат: SAT архив с summary.txt в формате других SAT (overall_status=OK/FAILED)
|
||||
- После реализации: CPU в Health Check получает реальный PASS/FAIL статус
|
||||
|
||||
## Real hardware validation
|
||||
|
||||
**Статус:** ожидает доступа к железу.
|
||||
|
||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
||||
- Kernel modules and nvidia-smi come from a single verified source.
|
||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.1"
|
||||
version: "2.7"
|
||||
updated: "2026-03-15"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.1** · Дата: **2026-03-15**
|
||||
Версия: **2.7** · Дата: **2026-03-15**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,9 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.7 | 2026-03-15 | Явно запрещён синтез данных в `event_logs`; интеграторы не должны придумывать серийные номера компонентов, если источник их не отдал |
|
||||
| 2.6 | 2026-03-15 | Добавлена необязательная секция `event_logs` для dedup/upsert логов `host` / `bmc` / `redfish` вне history timeline |
|
||||
| 2.5 | 2026-03-15 | Добавлено общее необязательное поле `manufactured_year_week` для компонентных секций (`YYYY-Www`) |
|
||||
| 2.4 | 2026-03-15 | Добавлена первая волна component telemetry: health/life поля для `cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies` |
|
||||
| 2.3 | 2026-03-15 | Добавлены component telemetry поля: `pcie_devices.temperature_c`, `pcie_devices.power_w`, `power_supplies.temperature_c` |
|
||||
| 2.2 | 2026-03-15 | Добавлено поле `numa_node` у `pcie_devices` для topology/affinity |
|
||||
@@ -38,6 +41,7 @@ language: ru
|
||||
3. **Частичность** — можно передавать только те секции, данные по которым доступны. Пустой массив и отсутствие секции эквивалентны.
|
||||
4. **Строгая схема** — endpoint использует строгий JSON-декодер; неизвестные поля приводят к `400 Bad Request`.
|
||||
5. **Event-driven** — импорт создаёт события в timeline (LOG_COLLECTED, INSTALLED, REMOVED, FIRMWARE_CHANGED и др.).
|
||||
6. **Без синтеза со стороны интегратора** — сборщик передаёт только фактически собранные значения. Нельзя придумывать `serial_number`, `component_ref`, `message`, `message_id` или другие идентификаторы/атрибуты, если источник их не предоставил или парсер не смог их надёжно извлечь.
|
||||
|
||||
---
|
||||
|
||||
@@ -127,7 +131,8 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"storage": [ ... ],
|
||||
"pcie_devices": [ ... ],
|
||||
"power_supplies": [ ... ],
|
||||
"sensors": { ... }
|
||||
"sensors": { ... },
|
||||
"event_logs": [ ... ]
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -157,6 +162,7 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `status_changed_at` | string RFC3339 | Время последнего изменения статуса |
|
||||
| `status_history` | array | История переходов статусов (см. ниже) |
|
||||
| `error_description` | string | Текст ошибки/диагностики |
|
||||
| `manufactured_year_week` | string | Дата производства в формате `YYYY-Www`, например `2024-W07` |
|
||||
|
||||
**Объект `status_history[]`:**
|
||||
|
||||
@@ -178,6 +184,7 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
- Если источник хранит историю — передавайте `status_history` отсортированным по `changed_at` по возрастанию.
|
||||
- Не включайте записи `status_history` без `changed_at`.
|
||||
- Все даты — RFC3339, рекомендуется UTC (`Z`).
|
||||
- `manufactured_year_week` используйте, когда источник знает только год и неделю производства, без точной календарной даты.
|
||||
|
||||
---
|
||||
|
||||
@@ -250,12 +257,14 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||
| `serial_number` | string | нет | Серийный номер (если доступен) |
|
||||
| `firmware` | string | нет | Версия микрокода |
|
||||
| `firmware` | string | нет | Версия микрокода; если логгер отдает `Microcode level`, передавайте его сюда как есть |
|
||||
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||
| + общие поля статуса | | | см. раздел выше |
|
||||
|
||||
**Генерация serial_number при отсутствии:** `{board_serial}-CPU-{socket}`
|
||||
|
||||
Если источник использует поле/лейбл `Microcode level`, его значение передавайте в `cpus[].firmware` без дополнительного преобразования.
|
||||
|
||||
```json
|
||||
"cpus": [
|
||||
{
|
||||
@@ -282,7 +291,6 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `slot` | string | нет | Идентификатор слота |
|
||||
| `location` | string | нет | Физическое расположение |
|
||||
| `present` | bool | нет | Наличие модуля (по умолчанию `true`) |
|
||||
| `serial_number` | string | нет | Серийный номер |
|
||||
| `part_number` | string | нет | Партномер (используется как модель) |
|
||||
@@ -328,7 +336,7 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `slot` | string | нет | Идентификатор слота |
|
||||
| `slot` | string | нет | Канонический адрес установки PCIe-устройства; передавайте BDF (`0000:18:00.0`) |
|
||||
| `serial_number` | string | нет | Серийный номер |
|
||||
| `model` | string | нет | Модель |
|
||||
| `manufacturer` | string | нет | Производитель |
|
||||
@@ -404,7 +412,7 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||
| `bdf` | string | нет | Bus:Device.Function, например `0000:18:00.0` |
|
||||
| `bdf` | string | нет | Deprecated alias для `slot`; при наличии ingest нормализует его в `slot` |
|
||||
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||
| `manufacturer` | string | нет | Производитель |
|
||||
| `model` | string | нет | Модель |
|
||||
@@ -421,7 +429,9 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||
|
||||
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`
|
||||
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`, где `slot` для PCIe равен BDF.
|
||||
|
||||
`slot` — единственный канонический адрес компонента. Для PCIe в `slot` передавайте BDF. Поле `bdf` сохраняется только как переходный alias на входе и не должно использоваться как отдельная координата рядом со `slot`.
|
||||
|
||||
**Значения `device_class`:**
|
||||
|
||||
@@ -441,7 +451,7 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "PCIeCard2",
|
||||
"slot": "0000:3b:00.0",
|
||||
"vendor_id": 5555,
|
||||
"device_id": 4401,
|
||||
"numa_node": 0,
|
||||
@@ -450,7 +460,6 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"sfp_temperature_c": 36.2,
|
||||
"sfp_tx_power_dbm": -1.8,
|
||||
"sfp_rx_power_dbm": -2.1,
|
||||
"bdf": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"manufacturer": "Intel",
|
||||
"model": "X710 10GbE",
|
||||
@@ -526,6 +535,58 @@ PSU без `serial_number` игнорируется.
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### event_logs
|
||||
|
||||
Нормализованные операционные логи сервера из `host`, `bmc` или `redfish`.
|
||||
|
||||
Эти записи не попадают в history timeline и не создают history events. Они сохраняются в отдельной deduplicated log store и отображаются в отдельном UI-блоке asset logs / host logs.
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `source` | string | **да** | Источник лога: `host`, `bmc`, `redfish` |
|
||||
| `event_time` | string RFC3339 | нет | Время события из источника; если отсутствует, используется время ingest/collection |
|
||||
| `severity` | string | нет | Уровень: `OK`, `Info`, `Warning`, `Critical`, `Unknown` |
|
||||
| `message_id` | string | нет | Идентификатор/код события источника |
|
||||
| `message` | string | **да** | Нормализованный текст события |
|
||||
| `component_ref` | string | нет | Ссылка на компонент/устройство/слот, если извлекается |
|
||||
| `fingerprint` | string | нет | Внешний готовый dedup-key; если не передан, система вычисляет свой |
|
||||
| `is_active` | bool | нет | Признак, что событие всё ещё активно/не погашено, если источник умеет lifecycle |
|
||||
| `raw_payload` | object | нет | Сырой vendor-specific payload для диагностики |
|
||||
|
||||
**Правила event_logs:**
|
||||
- Логи дедуплицируются в рамках asset + source + fingerprint.
|
||||
- Если `fingerprint` не передан, система строит его из нормализованных полей (`source`, `message_id`, `message`, `component_ref`, временная нормализация).
|
||||
- Интегратор/сборщик логов не должен синтезировать содержимое событий: не придумывайте `message`, `message_id`, `component_ref`, serial/device identifiers или иные поля, если они отсутствуют в исходном логе или не были надёжно извлечены.
|
||||
- Повторное получение того же события обновляет `last_seen_at`/счётчик повторов и не должно создавать новый timeline/history event.
|
||||
- `event_logs` используются для отдельного UI-представления логов и не изменяют canonical state компонентов/asset по умолчанию.
|
||||
|
||||
```json
|
||||
"event_logs": [
|
||||
{
|
||||
"source": "bmc",
|
||||
"event_time": "2026-03-15T14:03:11Z",
|
||||
"severity": "Warning",
|
||||
"message_id": "0x000F",
|
||||
"message": "Correctable ECC error threshold exceeded",
|
||||
"component_ref": "CPU0_C0D0",
|
||||
"raw_payload": {
|
||||
"sensor": "DIMM_A1",
|
||||
"sel_record_id": "0042"
|
||||
}
|
||||
},
|
||||
{
|
||||
"source": "redfish",
|
||||
"event_time": "2026-03-15T14:03:20Z",
|
||||
"severity": "Info",
|
||||
"message_id": "OpenBMC.0.1.SystemReboot",
|
||||
"message": "System reboot requested by administrator",
|
||||
"component_ref": "Mainboard"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
#### sensors.fans
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
@@ -608,10 +669,12 @@ PSU без `serial_number` игнорируется.
|
||||
|
||||
## Обработка отсутствующих serial_number
|
||||
|
||||
Общее правило для всех секций: если источник не вернул серийный номер и сборщик не смог его надёжно извлечь, интегратор не должен подставлять вымышленные значения, хеши, локальные placeholder-идентификаторы или серийные номера "по догадке". Разрешены только явно оговорённые ниже server-side fallback-правила ingest.
|
||||
|
||||
| Тип | Поведение |
|
||||
|-----|-----------|
|
||||
| CPU | Генерируется: `{board_serial}-CPU-{socket}` |
|
||||
| PCIe | Генерируется: `{board_serial}-PCIE-{slot}` (если serial = `"N/A"` или пустой) |
|
||||
| PCIe | Генерируется: `{board_serial}-PCIE-{slot}` (если serial = `"N/A"` или пустой; `slot` для PCIe = BDF) |
|
||||
| Memory | Компонент игнорируется |
|
||||
| Storage | Компонент игнорируется |
|
||||
| PSU | Компонент игнорируется |
|
||||
@@ -687,7 +750,7 @@ PSU без `serial_number` игнорируется.
|
||||
],
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "PCIeCard1",
|
||||
"slot": "0000:18:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"manufacturer": "Intel",
|
||||
"model": "X710 10GbE",
|
||||
|
||||
22
bible-local/docs/iso-build-rules.md
Normal file
22
bible-local/docs/iso-build-rules.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# ISO Build Rules
|
||||
|
||||
## Verify package names before use
|
||||
|
||||
ISO builds take 30–60 minutes. A wrong package name wastes an entire build cycle.
|
||||
|
||||
**Rule: before adding any Debian package name to the ISO config, verify it exists and check its file list.**
|
||||
|
||||
Use one of:
|
||||
- `https://packages.debian.org/bookworm/<package-name>` — existence + description
|
||||
- `https://packages.debian.org/bookworm/amd64/<package-name>/filelist` — exact files installed
|
||||
- `apt-cache show <package>` inside a Debian bookworm container
|
||||
|
||||
This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Prefer live-build's built-in memtest integration over custom hooks or hardcoded
|
||||
bootloader paths. If you ever need to reference memtest files manually, verify
|
||||
the exact package file list first for the target Debian release.
|
||||
35
bible-local/docs/validate-vs-burn.md
Normal file
35
bible-local/docs/validate-vs-burn.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Validate vs Burn: Hardware Impact Policy
|
||||
|
||||
## Validate Tests (non-destructive)
|
||||
|
||||
Tests on the **Validate** page are purely diagnostic. They:
|
||||
|
||||
- **Do not write to disks** — no data is written to storage devices; SMART counters (power-on hours, load cycle count, reallocated sectors) are not incremented.
|
||||
- **Do not run sustained high load** — commands complete quickly (seconds to minutes) and do not push hardware to thermal or electrical limits.
|
||||
- **Do not increment hardware wear counters** — GPU memory ECC counters, NVMe wear leveling counters, and similar endurance metrics are unaffected.
|
||||
- **Are safe to run repeatedly** — on new, production-bound, or already-deployed hardware without concern for reducing lifespan.
|
||||
|
||||
### What Validate tests actually do
|
||||
|
||||
| Test | What it runs |
|
||||
|---|---|
|
||||
| NVIDIA GPU | `nvidia-smi`, `dcgmi diag` (levels 1–4 read-only diagnostics) |
|
||||
| Memory | `memtester` on a limited allocation; reads/writes to RAM only |
|
||||
| Storage | `smartctl -a`, `nvme smart-log` — reads SMART data only |
|
||||
| CPU | `stress-ng` for a bounded duration; CPU-only, no I/O |
|
||||
| AMD GPU | `rocm-smi --showallinfo`, `dmidecode` — read-only queries |
|
||||
|
||||
## Burn Tests (hardware wear)
|
||||
|
||||
Tests on the **Burn** page run hardware at maximum or near-maximum load for extended durations. They:
|
||||
|
||||
- **Wear storage**: write-intensive patterns can reduce SSD endurance (P/E cycles).
|
||||
- **Stress GPU memory**: extended ECC stress tests may surface latent defects but also exercise memory cells.
|
||||
- **Accelerate thermal cycling**: repeated heat/cool cycles degrade solder joints and capacitors over time.
|
||||
- **May increment wear counters**: GPU power-on hours, NVMe media wear indicator, and similar metrics will advance.
|
||||
|
||||
### Rule
|
||||
|
||||
> Run **Validate** freely on any server, at any time, before or after deployment.
|
||||
> Run **Burn** only when explicitly required (e.g., initial acceptance after repair, or per customer SLA).
|
||||
> Document when and why Burn tests were run.
|
||||
Submodule internal/chart updated: 05db6994d4...ac8120c8ab
59
iso/README.md
Normal file
59
iso/README.md
Normal file
@@ -0,0 +1,59 @@
|
||||
# ISO Build
|
||||
|
||||
`bee` ISO is built inside a Debian 12 builder container via `iso/builder/build-in-container.sh`.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Docker Desktop or another Docker-compatible container runtime
|
||||
- Privileged containers enabled
|
||||
- Enough free disk space for builder cache, Debian live-build artifacts, NVIDIA driver cache, and CUDA userspace packages
|
||||
|
||||
## Build On macOS
|
||||
|
||||
From the repository root:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
The script defaults to `linux/amd64` builder containers, so it works on:
|
||||
|
||||
- Intel Mac
|
||||
- Apple Silicon (`M1` / `M2` / `M3` / `M4`) via Docker Desktop's Linux VM
|
||||
|
||||
You do not need to pass `--platform` manually for normal ISO builds.
|
||||
|
||||
## Useful Options
|
||||
|
||||
Build with explicit SSH keys baked into the ISO:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --authorized-keys ~/.ssh/id_ed25519.pub
|
||||
```
|
||||
|
||||
Rebuild the builder image:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --rebuild-image
|
||||
```
|
||||
|
||||
Use a custom cache directory:
|
||||
|
||||
```sh
|
||||
sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
BEE_BUILDER_PLATFORM=linux/amd64 sh iso/builder/build-in-container.sh
|
||||
```
|
||||
|
||||
- The shipped ISO is still `amd64`.
|
||||
- Output ISO artifacts are written under `dist/`.
|
||||
@@ -1,7 +1,6 @@
|
||||
FROM debian:12
|
||||
|
||||
ARG GO_VERSION=1.24.0
|
||||
ARG DEBIAN_KERNEL_ABI=6.1.0-43
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
@@ -24,9 +23,33 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
"linux-headers-${DEBIAN_KERNEL_ABI}-amd64" \
|
||||
pkg-config \
|
||||
yasm \
|
||||
libssl-dev \
|
||||
zlib1g-dev \
|
||||
libbz2-dev \
|
||||
libgmp-dev \
|
||||
libpcap-dev \
|
||||
libsqlite3-dev \
|
||||
libcurl4-openssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Add NVIDIA CUDA repo and install nvcc (needed to compile nccl-tests)
|
||||
RUN wget -qO /tmp/cuda-keyring.gpg \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/3bf863cc.pub \
|
||||
&& gpg --dearmor < /tmp/cuda-keyring.gpg \
|
||||
> /usr/share/keyrings/nvidia-cuda.gpg \
|
||||
&& rm /tmp/cuda-keyring.gpg \
|
||||
&& echo "deb [signed-by=/usr/share/keyrings/nvidia-cuda.gpg] \
|
||||
https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/ /" \
|
||||
> /etc/apt/sources.list.d/cuda.list \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -y cuda-nvcc-12-8 \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& ln -sfn /usr/local/cuda-12.8 /usr/local/cuda
|
||||
|
||||
RUN arch="$(dpkg --print-architecture)" \
|
||||
&& case "$arch" in \
|
||||
amd64) goarch=amd64 ;; \
|
||||
|
||||
@@ -1,5 +1,23 @@
|
||||
DEBIAN_VERSION=12
|
||||
DEBIAN_KERNEL_ABI=6.1.0-43
|
||||
DEBIAN_KERNEL_ABI=auto
|
||||
NVIDIA_DRIVER_VERSION=590.48.01
|
||||
NCCL_VERSION=2.28.9-1
|
||||
NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.2-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||
ROCM_VALIDATION_SUITE_VERSION=1.1.0.60304-76~22.04
|
||||
ROCBLAS_VERSION=4.3.0.60304-76~22.04
|
||||
ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=0.1.1
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -7,6 +7,15 @@ set -e
|
||||
|
||||
. "$(dirname "$0")/../VERSIONS"
|
||||
|
||||
# Pin the exact kernel ABI detected by build.sh so the ISO kernel matches
|
||||
# the kernel headers used to compile NVIDIA modules. Falls back to meta-package
|
||||
# when lb config is run manually without the environment variable.
|
||||
if [ -n "${BEE_KERNEL_ABI:-}" ] && [ "${BEE_KERNEL_ABI}" != "auto" ]; then
|
||||
LB_LINUX_PACKAGES="linux-image-${BEE_KERNEL_ABI}"
|
||||
else
|
||||
LB_LINUX_PACKAGES="linux-image"
|
||||
fi
|
||||
|
||||
lb config noauto \
|
||||
--distribution bookworm \
|
||||
--architectures amd64 \
|
||||
@@ -19,10 +28,11 @@ lb config noauto \
|
||||
--mirror-binary "https://deb.debian.org/debian" \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "linux-image-${DEBIAN_KERNEL_ABI}" \
|
||||
--memtest none \
|
||||
--iso-volume "BEE" \
|
||||
--iso-application "Bee Hardware Audit" \
|
||||
--bootappend-live "boot=live components console=tty0 console=ttyS0,115200n8 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
190
iso/builder/build-cublas.sh
Normal file
190
iso/builder/build-cublas.sh
Normal file
@@ -0,0 +1,190 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
CUBLAS_VERSION="$1"
|
||||
CUDA_USERSPACE_VERSION="$2"
|
||||
CUDA_SERIES="$3"
|
||||
DIST_DIR="$4"
|
||||
|
||||
[ -n "$CUBLAS_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_USERSPACE_VERSION" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$CUDA_SERIES" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <cublas-version> <cuda-userspace-version> <cuda-series> <dist-dir>"; exit 1; }
|
||||
|
||||
CUDA_SERIES_DASH=$(printf '%s' "$CUDA_SERIES" | tr '.' '-')
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||
CACHE_DIR="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${CUDA_SERIES}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/cublas-downloads"
|
||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||
|
||||
echo "=== cuBLAS ${CUBLAS_VERSION} / cudart ${CUDA_USERSPACE_VERSION} / CUDA ${CUDA_SERIES} ==="
|
||||
|
||||
if [ -f "${CACHE_DIR}/include/cublasLt.h" ] && [ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] \
|
||||
&& [ -f "${CACHE_DIR}/include/crt/host_defines.h" ] \
|
||||
&& [ -f "${CACHE_DIR}/include/nv/target" ] \
|
||||
&& [ "$(find "${CACHE_DIR}/lib" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== cuBLAS cached, skipping download ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/include" "${CACHE_DIR}/lib"
|
||||
|
||||
echo "=== downloading Packages.gz ==="
|
||||
wget -q -O "${PACKAGES_GZ}" "${REPO_BASE}/Packages.gz"
|
||||
|
||||
lookup_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2" # if empty, match any version (first found)
|
||||
gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" -v ver="$ver" '
|
||||
/^Package: / { cur_pkg=$2; gsub(/\r/, "", cur_pkg) }
|
||||
/^Version: / { cur_ver=$2; gsub(/\r/, "", cur_ver) }
|
||||
/^Filename: / { cur_file=$2; gsub(/\r/, "", cur_file) }
|
||||
/^SHA256: / { cur_sha=$2; gsub(/\r/, "", cur_sha) }
|
||||
/^$/ {
|
||||
if (cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
|
||||
print cur_file " " cur_sha
|
||||
printed=1
|
||||
exit
|
||||
}
|
||||
cur_pkg=""; cur_ver=""; cur_file=""; cur_sha=""
|
||||
}
|
||||
END {
|
||||
if (!printed && cur_pkg == pkg && (ver == "" || cur_ver == ver)) {
|
||||
print cur_file " " cur_sha
|
||||
}
|
||||
}'
|
||||
}
|
||||
|
||||
download_verified_pkg() {
|
||||
pkg="$1"
|
||||
ver="$2"
|
||||
|
||||
meta="$(lookup_pkg "$pkg" "$ver")"
|
||||
[ -n "$meta" ] || { echo "ERROR: package metadata not found for ${pkg} ${ver}"; exit 1; }
|
||||
|
||||
repo_file="$(printf '%s\n' "$meta" | awk '{print $1}')"
|
||||
repo_sha="$(printf '%s\n' "$meta" | awk '{print $2}')"
|
||||
[ -n "$repo_file" ] || { echo "ERROR: package filename missing for ${pkg}"; exit 1; }
|
||||
[ -n "$repo_sha" ] || { echo "ERROR: package sha missing for ${pkg}"; exit 1; }
|
||||
|
||||
out="${DOWNLOAD_CACHE_DIR}/$(basename "$repo_file")"
|
||||
if [ -f "$out" ]; then
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" = "$repo_sha" ]; then
|
||||
echo "=== using cached $(basename "$repo_file") ===" >&2
|
||||
printf '%s\n' "$out"
|
||||
return 0
|
||||
fi
|
||||
echo "=== removing stale $(basename "$repo_file") (sha256 mismatch) ===" >&2
|
||||
rm -f "$out"
|
||||
fi
|
||||
|
||||
echo "=== downloading $(basename "$repo_file") ===" >&2
|
||||
wget --show-progress -O "$out" "${REPO_BASE}/$(basename "$repo_file")"
|
||||
|
||||
actual_sha="$(sha256sum "$out" | awk '{print $1}')"
|
||||
if [ "$actual_sha" != "$repo_sha" ]; then
|
||||
echo "ERROR: sha256 mismatch for $(basename "$repo_file")" >&2
|
||||
echo " expected: $repo_sha" >&2
|
||||
echo " actual: $actual_sha" >&2
|
||||
rm -f "$out"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK: $(basename "$repo_file")" >&2
|
||||
printf '%s\n' "$out"
|
||||
}
|
||||
|
||||
extract_deb() {
|
||||
deb="$1"
|
||||
dst="$2"
|
||||
mkdir -p "$dst"
|
||||
(
|
||||
cd "$dst"
|
||||
ar x "$deb"
|
||||
data_tar=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$data_tar" ] || { echo "ERROR: data.tar.* not found in $deb"; exit 1; }
|
||||
tar xf "$data_tar"
|
||||
)
|
||||
}
|
||||
|
||||
copy_headers() {
|
||||
from="$1"
|
||||
if [ -d "${from}/usr/include" ]; then
|
||||
cp -a "${from}/usr/include/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
# NVIDIA CUDA packages install headers under /usr/local/cuda-X.Y/targets/x86_64-linux/include/
|
||||
find "$from" -type d -name include | while read -r inc_dir; do
|
||||
case "$inc_dir" in
|
||||
*/usr/include) ;; # already handled above
|
||||
*)
|
||||
if find "${inc_dir}" -maxdepth 3 \( -name '*.h' -o -type f \) | grep -q .; then
|
||||
cp -a "${inc_dir}/." "${CACHE_DIR}/include/"
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
done
|
||||
}
|
||||
|
||||
copy_libs() {
|
||||
from="$1"
|
||||
find "$from" \( -name 'libcublas.so*' -o -name 'libcublasLt.so*' -o -name 'libcudart.so*' \) \
|
||||
\( -type f -o -type l \) -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
}
|
||||
|
||||
make_links() {
|
||||
base="$1"
|
||||
versioned=$(find "${CACHE_DIR}/lib" -maxdepth 1 -name "${base}.so.[0-9]*" -type f | sort | head -1)
|
||||
[ -n "$versioned" ] || return 0
|
||||
soname=$(printf '%s\n' "$versioned" | sed -E "s#.*/(${base}\.so\.[0-9]+).*#\\1#")
|
||||
target=$(basename "$versioned")
|
||||
ln -sf "$target" "${CACHE_DIR}/lib/${soname}" 2>/dev/null || true
|
||||
ln -sf "${soname}" "${CACHE_DIR}/lib/${base}.so" 2>/dev/null || true
|
||||
}
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "$TMP_DIR"' EXIT INT TERM
|
||||
|
||||
CUBLAS_RT_DEB=$(download_verified_pkg "libcublas-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUBLAS_DEV_DEB=$(download_verified_pkg "libcublas-dev-${CUDA_SERIES_DASH}" "${CUBLAS_VERSION}")
|
||||
CUDART_RT_DEB=$(download_verified_pkg "cuda-cudart-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDART_DEV_DEB=$(download_verified_pkg "cuda-cudart-dev-${CUDA_SERIES_DASH}" "${CUDA_USERSPACE_VERSION}")
|
||||
CUDA_CRT_DEB=$(download_verified_pkg "cuda-crt-${CUDA_SERIES_DASH}" "")
|
||||
CUDA_CCCL_DEB=$(download_verified_pkg "cuda-cccl-${CUDA_SERIES_DASH}" "")
|
||||
|
||||
extract_deb "$CUBLAS_RT_DEB" "${TMP_DIR}/cublas-rt"
|
||||
extract_deb "$CUBLAS_DEV_DEB" "${TMP_DIR}/cublas-dev"
|
||||
extract_deb "$CUDART_RT_DEB" "${TMP_DIR}/cudart-rt"
|
||||
extract_deb "$CUDART_DEV_DEB" "${TMP_DIR}/cudart-dev"
|
||||
extract_deb "$CUDA_CRT_DEB" "${TMP_DIR}/cuda-crt"
|
||||
extract_deb "$CUDA_CCCL_DEB" "${TMP_DIR}/cuda-cccl"
|
||||
|
||||
copy_headers "${TMP_DIR}/cublas-dev"
|
||||
copy_headers "${TMP_DIR}/cudart-dev"
|
||||
copy_headers "${TMP_DIR}/cuda-crt"
|
||||
copy_headers "${TMP_DIR}/cuda-cccl"
|
||||
copy_libs "${TMP_DIR}/cublas-rt"
|
||||
copy_libs "${TMP_DIR}/cudart-rt"
|
||||
|
||||
make_links "libcublas"
|
||||
make_links "libcublasLt"
|
||||
make_links "libcudart"
|
||||
|
||||
[ -f "${CACHE_DIR}/include/cublasLt.h" ] || { echo "ERROR: cublasLt.h not extracted"; exit 1; }
|
||||
[ -f "${CACHE_DIR}/include/cuda_runtime_api.h" ] || { echo "ERROR: cuda_runtime_api.h not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublasLt.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublasLt not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcublas.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcublas not extracted"; exit 1; }
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' | wc -l)" -gt 0 ] || { echo "ERROR: libcudart not extracted"; exit 1; }
|
||||
|
||||
echo "=== cuBLAS extraction complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "headers: $(find "${CACHE_DIR}/include" -type f | wc -l)"
|
||||
echo "libs: $(find "${CACHE_DIR}/lib" -maxdepth 1 \( -name 'libcublas*.so*' -o -name 'libcudart.so*' \) | wc -l)"
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/bin/sh
|
||||
# build-in-container.sh — build the bee ISO inside a Debian container.
|
||||
# build-in-container.sh — build the bee ISO inside the Debian builder container.
|
||||
|
||||
set -e
|
||||
|
||||
@@ -7,9 +7,12 @@ REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
|
||||
@@ -27,19 +30,54 @@ while [ $# -gt 0 ]; do
|
||||
AUTH_KEYS="$2"
|
||||
shift 2
|
||||
;;
|
||||
--clean-build)
|
||||
CLEAN_CACHE=1
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
VARIANT="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--authorized-keys /path/to/authorized_keys]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||
"${CACHE_DIR:?}/go-mod" \
|
||||
"${CACHE_DIR:?}/tmp" \
|
||||
"${CACHE_DIR:?}/bee" \
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
if ! command -v "$CONTAINER_TOOL" >/dev/null 2>&1; then
|
||||
echo "container tool not found: $CONTAINER_TOOL" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PLATFORM_OS="${BUILDER_PLATFORM%/*}"
|
||||
PLATFORM_ARCH="${BUILDER_PLATFORM#*/}"
|
||||
if [ -z "$PLATFORM_OS" ] || [ -z "$PLATFORM_ARCH" ] || [ "$PLATFORM_OS" = "$BUILDER_PLATFORM" ]; then
|
||||
echo "invalid BEE_BUILDER_PLATFORM: ${BUILDER_PLATFORM} (expected os/arch, e.g. linux/amd64)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
[ -f "$AUTH_KEYS" ] || { echo "authorized_keys not found: $AUTH_KEYS" >&2; exit 1; }
|
||||
AUTH_KEYS_ABS="$(cd "$(dirname "$AUTH_KEYS")" && pwd)/$(basename "$AUTH_KEYS")"
|
||||
@@ -56,40 +94,101 @@ mkdir -p \
|
||||
|
||||
IMAGE_REF="${IMAGE_TAG}:debian${DEBIAN_VERSION}"
|
||||
|
||||
if [ "$REBUILD_IMAGE" = "1" ] || ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
image_matches_platform() {
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || true)"
|
||||
[ "$actual_platform" = "${BUILDER_PLATFORM}" ]
|
||||
}
|
||||
|
||||
NEED_BUILD_IMAGE=0
|
||||
if [ "$REBUILD_IMAGE" = "1" ]; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! "$CONTAINER_TOOL" image inspect "${IMAGE_REF}" >/dev/null 2>&1; then
|
||||
NEED_BUILD_IMAGE=1
|
||||
elif ! image_matches_platform; then
|
||||
actual_platform="$("$CONTAINER_TOOL" image inspect --format '{{.Os}}/{{.Architecture}}' "${IMAGE_REF}" 2>/dev/null || echo unknown)"
|
||||
echo "=== rebuilding builder image ${IMAGE_REF}: platform mismatch (${actual_platform} != ${BUILDER_PLATFORM}) ==="
|
||||
NEED_BUILD_IMAGE=1
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD_IMAGE" = "1" ]; then
|
||||
"$CONTAINER_TOOL" build \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
--build-arg GO_VERSION="${GO_VERSION}" \
|
||||
--build-arg DEBIAN_KERNEL_ABI="${DEBIAN_KERNEL_ABI}" \
|
||||
-t "${IMAGE_REF}" \
|
||||
"${BUILDER_DIR}"
|
||||
else
|
||||
echo "=== using existing builder image ${IMAGE_REF} ==="
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
# Build base docker run args (without --authorized-keys)
|
||||
build_run_args() {
|
||||
_variant="$1"
|
||||
_auth_arg=""
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
echo "run --rm --privileged \
|
||||
--platform ${BUILDER_PLATFORM} \
|
||||
-v ${REPO_ROOT}:/work \
|
||||
-v ${CACHE_DIR}:/cache \
|
||||
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
${IMAGE_REF} \
|
||||
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||
}
|
||||
|
||||
"$CONTAINER_TOOL" "$@"
|
||||
run_variant() {
|
||||
_v="$1"
|
||||
echo "=== building variant: ${_v} ==="
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
--authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
else
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
fi
|
||||
}
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
nogpu)
|
||||
run_variant nogpu
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
esac
|
||||
|
||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
|
||||
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
|
||||
#
|
||||
# Downloads a pinned source snapshot from the official openwall/john repository,
|
||||
# builds it inside the builder container, and caches the resulting run/ tree.
|
||||
|
||||
set -e
|
||||
|
||||
JOHN_COMMIT="$1"
|
||||
DIST_DIR="$2"
|
||||
|
||||
[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||
|
||||
echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
|
||||
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
|
||||
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
|
||||
|
||||
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
|
||||
echo "=== john cached, skipping build ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}"
|
||||
if [ ! -f "${SRC_TAR}" ]; then
|
||||
echo "=== downloading john source snapshot ==="
|
||||
wget --show-progress -O "${SRC_TAR}" "${SRC_URL}"
|
||||
fi
|
||||
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "${BUILD_TMP}"' EXIT INT TERM
|
||||
|
||||
cd "${BUILD_TMP}"
|
||||
tar xf "${SRC_TAR}"
|
||||
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
|
||||
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
|
||||
|
||||
cd "${SRC_DIR}/src"
|
||||
echo "=== configuring john ==="
|
||||
./configure
|
||||
echo "=== building john ==="
|
||||
make clean >/dev/null 2>&1 || true
|
||||
make -j"$(nproc)"
|
||||
|
||||
mkdir -p "${CACHE_DIR}"
|
||||
cp -a "../run" "${CACHE_DIR}/run"
|
||||
chmod +x "${CACHE_DIR}/run/john"
|
||||
|
||||
echo "=== john build complete ==="
|
||||
echo "run dir: ${CACHE_DIR}/run"
|
||||
164
iso/builder/build-nccl-tests.sh
Executable file
164
iso/builder/build-nccl-tests.sh
Executable file
@@ -0,0 +1,164 @@
|
||||
#!/bin/sh
|
||||
# build-nccl-tests.sh — build nccl-tests all_reduce_perf for the LiveCD.
|
||||
#
|
||||
# Downloads nccl-tests source from GitHub, downloads libnccl-dev .deb for
|
||||
# nccl.h, and compiles all_reduce_perf with nvcc (cuda-nvcc-13-0).
|
||||
#
|
||||
# Output is cached in DIST_DIR/nccl-tests-<version>/ so subsequent builds
|
||||
# are instant unless NCCL_TESTS_VERSION changes.
|
||||
#
|
||||
# Output layout:
|
||||
# $CACHE_DIR/bin/all_reduce_perf
|
||||
# $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests
|
||||
|
||||
set -e
|
||||
|
||||
NCCL_TESTS_VERSION="$1"
|
||||
NCCL_VERSION="$2"
|
||||
NCCL_CUDA_VERSION="$3"
|
||||
DIST_DIR="$4"
|
||||
NVCC_VERSION="${5:-}"
|
||||
DEBIAN_VERSION="${6:-12}"
|
||||
|
||||
[ -n "$NCCL_TESTS_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-tests-version> <nccl-version> <cuda-version> <dist-dir> [nvcc-version] [debian-version]"; exit 1; }
|
||||
|
||||
echo "=== nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||
|
||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== nccl-tests cached, skipping build ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Resolve nvcc path (cuda-nvcc-X-Y installs to /usr/local/cuda-X.Y/bin/nvcc)
|
||||
NVCC_VERSION_PATH="$(echo "${NVCC_VERSION}" | tr '.' '.')"
|
||||
NVCC=""
|
||||
for candidate in nvcc "/usr/local/cuda-${NVCC_VERSION_PATH}/bin/nvcc" /usr/local/cuda-12/bin/nvcc /usr/local/cuda/bin/nvcc; do
|
||||
if command -v "$candidate" >/dev/null 2>&1 || [ -x "$candidate" ]; then
|
||||
NVCC="$candidate"
|
||||
break
|
||||
fi
|
||||
done
|
||||
[ -n "$NVCC" ] || { echo "ERROR: nvcc not found — install cuda-nvcc-$(echo "${NVCC_VERSION}" | tr '.' '-')"; exit 1; }
|
||||
echo "nvcc: $NVCC"
|
||||
|
||||
# Determine CUDA_HOME from nvcc location
|
||||
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
||||
echo "CUDA_HOME: $CUDA_HOME"
|
||||
|
||||
find_cudart_dir() {
|
||||
for dir in \
|
||||
"${CUDA_HOME}/targets/x86_64-linux/lib" \
|
||||
"${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
|
||||
"${CUDA_HOME}/lib64" \
|
||||
"${CUDA_HOME}/lib"; do
|
||||
if [ -d "$dir" ] && find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q .; then
|
||||
printf '%s\n' "$dir"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
|
||||
echo "cudart dir: $CUDART_DIR"
|
||||
|
||||
# Download libnccl-dev for nccl.h
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
||||
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||
DEV_URL="${REPO_BASE}/${DEV_PKG}"
|
||||
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR"
|
||||
DEV_DEB="${DOWNLOAD_CACHE_DIR}/${DEV_PKG}"
|
||||
|
||||
if [ ! -f "$DEV_DEB" ]; then
|
||||
echo "=== downloading libnccl-dev ==="
|
||||
wget --show-progress -O "$DEV_DEB" "$DEV_URL"
|
||||
fi
|
||||
|
||||
# Extract nccl.h from libnccl-dev
|
||||
NCCL_INCLUDE_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "$NCCL_INCLUDE_TMP" "$BUILD_TMP"' EXIT INT TERM
|
||||
|
||||
cd "$NCCL_INCLUDE_TMP"
|
||||
ar x "$DEV_DEB"
|
||||
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in libnccl-dev .deb"; exit 1; }
|
||||
tar xf "$DATA_TAR"
|
||||
|
||||
# nccl.h lands in ./usr/include/ or ./usr/local/cuda-X.Y/targets/.../include/
|
||||
NCCL_H=$(find . -name 'nccl.h' -type f 2>/dev/null | head -1)
|
||||
[ -n "$NCCL_H" ] || { echo "ERROR: nccl.h not found in libnccl-dev package"; exit 1; }
|
||||
NCCL_INCLUDE_DIR="$(pwd)/$(dirname "$NCCL_H")"
|
||||
echo "nccl.h: $NCCL_H"
|
||||
|
||||
# libnccl.so comes from the already-built NCCL cache (build-nccl.sh ran first)
|
||||
NCCL_LIB_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}/lib"
|
||||
[ -d "$NCCL_LIB_DIR" ] || { echo "ERROR: NCCL lib dir not found at $NCCL_LIB_DIR — run build-nccl.sh first"; exit 1; }
|
||||
echo "nccl lib: $NCCL_LIB_DIR"
|
||||
|
||||
# Download nccl-tests source
|
||||
SRC_TAR="${DOWNLOAD_CACHE_DIR}/nccl-tests-v${NCCL_TESTS_VERSION}.tar.gz"
|
||||
SRC_URL="https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${NCCL_TESTS_VERSION}.tar.gz"
|
||||
|
||||
if [ ! -f "$SRC_TAR" ]; then
|
||||
echo "=== downloading nccl-tests v${NCCL_TESTS_VERSION} ==="
|
||||
wget --show-progress -O "$SRC_TAR" "$SRC_URL"
|
||||
fi
|
||||
|
||||
# Extract and build
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
cd "$BUILD_TMP"
|
||||
tar xf "$SRC_TAR"
|
||||
SRC_DIR=$(ls -d nccl-tests-* 2>/dev/null | head -1)
|
||||
[ -n "$SRC_DIR" ] || { echo "ERROR: source directory not found in archive"; exit 1; }
|
||||
cd "$SRC_DIR"
|
||||
|
||||
echo "=== building all_reduce_perf ==="
|
||||
# Pick gencode based on the actual nvcc version:
|
||||
# CUDA 12.x — Volta..Blackwell (sm_70..sm_100)
|
||||
# CUDA 13.x — Hopper..Blackwell (sm_90..sm_100, Pascal/Volta/Ampere dropped)
|
||||
NVCC_MAJOR=$("$NVCC" --version 2>/dev/null | grep -oE 'release [0-9]+' | awk '{print $2}' | head -1)
|
||||
echo "nvcc major version: ${NVCC_MAJOR:-unknown}"
|
||||
if [ "${NVCC_MAJOR:-0}" -ge 13 ] 2>/dev/null; then
|
||||
GENCODE="-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100"
|
||||
echo "gencode: sm_90 sm_100 (CUDA 13+)"
|
||||
else
|
||||
GENCODE="-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_86,code=sm_86 \
|
||||
-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100"
|
||||
echo "gencode: sm_70..sm_100 (CUDA 12)"
|
||||
fi
|
||||
LIBRARY_PATH="$NCCL_LIB_DIR${LIBRARY_PATH:+:$LIBRARY_PATH}" \
|
||||
make MPI=0 \
|
||||
NVCC="$NVCC" \
|
||||
CUDA_HOME="$CUDA_HOME" \
|
||||
NCCL_HOME="$NCCL_INCLUDE_DIR/.." \
|
||||
NCCL_LIB="$NCCL_LIB_DIR" \
|
||||
NVCC_GENCODE="$GENCODE" \
|
||||
BUILDDIR="./build"
|
||||
|
||||
[ -f "./build/all_reduce_perf" ] || { echo "ERROR: all_reduce_perf not found after build"; exit 1; }
|
||||
|
||||
mkdir -p "${CACHE_DIR}/bin"
|
||||
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
|
||||
mkdir -p "${CACHE_DIR}/lib"
|
||||
find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
|
||||
|
||||
echo "=== nccl-tests build complete ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
|
||||
94
iso/builder/build-nccl.sh
Executable file
94
iso/builder/build-nccl.sh
Executable file
@@ -0,0 +1,94 @@
|
||||
#!/bin/sh
|
||||
# build-nccl.sh — download and extract NCCL shared library for the LiveCD.
|
||||
#
|
||||
# Downloads libnccl2 .deb from NVIDIA's CUDA apt repository (Debian 12, x86_64)
|
||||
# and extracts the shared library. Package integrity verified via sha256.
|
||||
#
|
||||
# Output is cached in DIST_DIR/nccl-<version>+cuda<cuda>/ so subsequent builds
|
||||
# are instant unless NCCL_VERSION or NCCL_CUDA_VERSION changes.
|
||||
#
|
||||
# Output layout:
|
||||
# $CACHE_DIR/lib/ — libnccl.so.* files
|
||||
|
||||
set -e
|
||||
|
||||
NCCL_VERSION="$1"
|
||||
NCCL_CUDA_VERSION="$2"
|
||||
DIST_DIR="$3"
|
||||
EXPECTED_SHA256="$4"
|
||||
|
||||
[ -n "$NCCL_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||
[ -n "$NCCL_CUDA_VERSION" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nccl-version> <cuda-version> <dist-dir> [sha256]"; exit 1; }
|
||||
|
||||
echo "=== NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-downloads"
|
||||
|
||||
if [ -d "$CACHE_DIR/lib" ] && [ "$(ls "$CACHE_DIR/lib/"libnccl.so.* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NCCL cached, skipping download ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "libs: $(ls "$CACHE_DIR/lib/" | wc -l) files"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64"
|
||||
PKG_NAME="libnccl2_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||
PKG_URL="${REPO_BASE}/${PKG_NAME}"
|
||||
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR"
|
||||
DEB_FILE="${DOWNLOAD_CACHE_DIR}/${PKG_NAME}"
|
||||
|
||||
echo "=== downloading NCCL package ==="
|
||||
echo "URL: ${PKG_URL}"
|
||||
wget --show-progress -O "$DEB_FILE" "$PKG_URL"
|
||||
|
||||
if [ -n "$EXPECTED_SHA256" ]; then
|
||||
echo "=== verifying sha256 ==="
|
||||
ACTUAL_SHA256=$(sha256sum "$DEB_FILE" | awk '{print $1}')
|
||||
if [ "$ACTUAL_SHA256" != "$EXPECTED_SHA256" ]; then
|
||||
echo "ERROR: sha256 mismatch"
|
||||
echo " expected: $EXPECTED_SHA256"
|
||||
echo " actual: $ACTUAL_SHA256"
|
||||
rm -f "$DEB_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK"
|
||||
fi
|
||||
|
||||
echo "=== extracting NCCL libraries ==="
|
||||
EXTRACT_TMP=$(mktemp -d)
|
||||
trap 'rm -rf "$EXTRACT_TMP"' EXIT INT TERM
|
||||
|
||||
# .deb is an ar archive; data.tar.* contains the actual files
|
||||
cd "$EXTRACT_TMP"
|
||||
ar x "$DEB_FILE"
|
||||
|
||||
# Extract data tarball (xz, gz, or zst)
|
||||
DATA_TAR=$(ls data.tar.* 2>/dev/null | head -1)
|
||||
[ -n "$DATA_TAR" ] || { echo "ERROR: data.tar.* not found in .deb"; exit 1; }
|
||||
tar xf "$DATA_TAR"
|
||||
|
||||
# Library lands in ./usr/lib/x86_64-linux-gnu/ or ./usr/lib/
|
||||
mkdir -p "$CACHE_DIR/lib"
|
||||
found=0
|
||||
for f in $(find . -name 'libnccl.so.*' -not -type d 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/"
|
||||
found=$((found + 1))
|
||||
done
|
||||
|
||||
[ "$found" -gt 0 ] || { echo "ERROR: libnccl.so.* not found in package"; exit 1; }
|
||||
|
||||
# Create soname symlinks: libnccl.so.2 -> libnccl.so.<full>, libnccl.so -> libnccl.so.2
|
||||
versioned=$(ls "$CACHE_DIR/lib/libnccl.so."[0-9][0-9.]* 2>/dev/null | head -1)
|
||||
if [ -n "$versioned" ]; then
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/libnccl.so.2" 2>/dev/null || true
|
||||
ln -sf "libnccl.so.2" "$CACHE_DIR/lib/libnccl.so" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo "=== NCCL extraction complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
ls -lh "$CACHE_DIR/lib/"
|
||||
@@ -10,7 +10,7 @@
|
||||
# Output layout:
|
||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||
|
||||
set -e
|
||||
|
||||
@@ -46,7 +46,8 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ]; then
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
@@ -129,8 +130,17 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
@@ -147,7 +157,14 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
|
||||
@@ -1,50 +1,381 @@
|
||||
#!/bin/sh
|
||||
# build.sh — build bee ISO (Debian 12 / live-build)
|
||||
#
|
||||
# Single build script. Produces a bootable live ISO with SSH access, TUI, NVIDIA drivers.
|
||||
#
|
||||
# Run on Debian 12 builder VM as root after setup-builder.sh.
|
||||
# Usage:
|
||||
# sh iso/builder/build.sh [--authorized-keys /path/to/authorized_keys]
|
||||
# build.sh — internal ISO build entrypoint executed inside the builder container.
|
||||
|
||||
set -e
|
||||
|
||||
if [ "${BEE_CONTAINER_BUILD:-0}" != "1" ]; then
|
||||
echo "build.sh must run inside iso/builder/build-in-container.sh" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||
BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||
DIST_DIR="${REPO_ROOT}/dist"
|
||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
AUTH_KEYS=""
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
|
||||
# parse args
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||
*) echo "unknown arg: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$BEE_GPU_VENDOR" in
|
||||
nvidia|amd|nogpu) ;;
|
||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||
|
||||
export BEE_GPU_VENDOR
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
|
||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||
git config --global safe.directory "${REPO_ROOT}"
|
||||
mkdir -p "${DIST_DIR}"
|
||||
mkdir -p "${CACHE_ROOT}"
|
||||
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||
export GOCACHE GOMODCACHE
|
||||
|
||||
echo "=== bee ISO build ==="
|
||||
resolve_audit_version() {
|
||||
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||
echo "${BEE_AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
if [ -z "${tag}" ]; then
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
fi
|
||||
case "${tag}" in
|
||||
audit/v*)
|
||||
echo "${tag#audit/v}"
|
||||
return 0
|
||||
;;
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
;;
|
||||
"")
|
||||
;;
|
||||
*)
|
||||
echo "${tag}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -n "${AUDIT_VERSION:-}" ]; then
|
||||
echo "${AUDIT_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
date +%Y%m%d
|
||||
}
|
||||
|
||||
# ISO image versioned separately from the audit binary (iso/v* tags).
|
||||
resolve_iso_version() {
|
||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||
echo "${BEE_ISO_VERSION}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Plain v* tags (e.g. v2.7) take priority — this is the current tagging scheme
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Legacy iso/v* tags fallback
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
iso/v*)
|
||||
echo "${tag#iso/v}"
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Fall back to audit version so the name is still meaningful
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
iso_list_files() {
|
||||
iso_path="$1"
|
||||
|
||||
if command -v bsdtar >/dev/null 2>&1; then
|
||||
bsdtar -tf "$iso_path"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if command -v xorriso >/dev/null 2>&1; then
|
||||
xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null | sed 's#^/##'
|
||||
return $?
|
||||
fi
|
||||
|
||||
return 127
|
||||
}
|
||||
|
||||
iso_extract_file() {
|
||||
iso_path="$1"
|
||||
iso_member="$2"
|
||||
|
||||
if command -v bsdtar >/dev/null 2>&1; then
|
||||
bsdtar -xOf "$iso_path" "$iso_member"
|
||||
return $?
|
||||
fi
|
||||
|
||||
if command -v xorriso >/dev/null 2>&1; then
|
||||
xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
|
||||
return $?
|
||||
fi
|
||||
|
||||
return 127
|
||||
}
|
||||
|
||||
require_iso_reader() {
|
||||
command -v bsdtar >/dev/null 2>&1 && return 0
|
||||
command -v xorriso >/dev/null 2>&1 && return 0
|
||||
memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
|
||||
}
|
||||
|
||||
dump_memtest_debug() {
|
||||
phase="$1"
|
||||
lb_dir="${2:-}"
|
||||
iso_path="${3:-}"
|
||||
phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
|
||||
memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"
|
||||
|
||||
(
|
||||
echo "=== memtest debug: ${phase} ==="
|
||||
|
||||
echo "-- auto/config --"
|
||||
if [ -f "${BUILDER_DIR}/auto/config" ]; then
|
||||
grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
|
||||
else
|
||||
echo " (missing ${BUILDER_DIR}/auto/config)"
|
||||
fi
|
||||
|
||||
echo "-- source bootloader templates --"
|
||||
for cfg in \
|
||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||
if [ -f "$cfg" ]; then
|
||||
echo " file: $cfg"
|
||||
grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
|
||||
echo "-- live-build workdir package lists --"
|
||||
for pkg in \
|
||||
"$lb_dir/config/package-lists/bee.list.chroot" \
|
||||
"$lb_dir/config/package-lists/bee-gpu.list.chroot" \
|
||||
"$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
|
||||
if [ -f "$pkg" ]; then
|
||||
echo " file: $pkg"
|
||||
grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "-- live-build chroot/boot --"
|
||||
if [ -d "$lb_dir/chroot/boot" ]; then
|
||||
find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/chroot/boot)"
|
||||
fi
|
||||
|
||||
echo "-- live-build binary/boot --"
|
||||
if [ -d "$lb_dir/binary/boot" ]; then
|
||||
find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/binary/boot)"
|
||||
fi
|
||||
|
||||
echo "-- live-build package cache --"
|
||||
if [ -d "$lb_dir/cache/packages.chroot" ]; then
|
||||
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
|
||||
else
|
||||
echo " (missing $lb_dir/cache/packages.chroot)"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
|
||||
echo "-- ISO memtest files --"
|
||||
iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
||||
|
||||
echo "-- ISO GRUB memtest lines --"
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
|
||||
|
||||
echo "-- ISO isolinux memtest lines --"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
|
||||
fi
|
||||
|
||||
echo "=== end memtest debug: ${phase} ==="
|
||||
) | {
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
|
||||
tee "${memtest_log}"
|
||||
else
|
||||
cat
|
||||
fi
|
||||
}
|
||||
}
|
||||
|
||||
memtest_fail() {
|
||||
msg="$1"
|
||||
iso_path="${2:-}"
|
||||
echo "ERROR: ${msg}" >&2
|
||||
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
validate_iso_memtest() {
|
||||
iso_path="$1"
|
||||
echo "=== validating memtest in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
||||
require_iso_reader "$iso_path"
|
||||
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
|
||||
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
||||
}
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
|
||||
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
|
||||
|
||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
|
||||
}
|
||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux memtest path is missing" "$iso_path"
|
||||
}
|
||||
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
|
||||
LOG_OUT="${LOG_DIR}/build.log"
|
||||
|
||||
cleanup_build_log() {
|
||||
status="${1:-$?}"
|
||||
trap - EXIT INT TERM HUP
|
||||
|
||||
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
|
||||
BUILD_LOG_ACTIVE=0
|
||||
exec 1>&3 2>&4
|
||||
exec 3>&- 4>&-
|
||||
if [ -n "${BUILD_TEE_PID:-}" ]; then
|
||||
wait "${BUILD_TEE_PID}" 2>/dev/null || true
|
||||
fi
|
||||
rm -f "${BUILD_LOG_PIPE}"
|
||||
fi
|
||||
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
exit "${status}"
|
||||
}
|
||||
|
||||
start_build_log() {
|
||||
command -v tee >/dev/null 2>&1 || {
|
||||
echo "ERROR: tee is required for build logging" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
rm -rf "${LOG_DIR}"
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
mkdir -p "${LOG_DIR}"
|
||||
BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
|
||||
mkfifo "${BUILD_LOG_PIPE}"
|
||||
|
||||
exec 3>&1 4>&2
|
||||
tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
|
||||
BUILD_TEE_PID=$!
|
||||
exec > "${BUILD_LOG_PIPE}" 2>&1
|
||||
BUILD_LOG_ACTIVE=1
|
||||
|
||||
trap 'cleanup_build_log "$?"' EXIT INT TERM HUP
|
||||
|
||||
echo "=== build log dir: ${LOG_DIR} ==="
|
||||
echo "=== build log: ${LOG_OUT} ==="
|
||||
echo "=== build log archive: ${LOG_ARCHIVE} ==="
|
||||
}
|
||||
|
||||
start_build_log
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||
apt-get update -qq
|
||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||
| head -1)
|
||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||
fi
|
||||
|
||||
# Export detected ABI so that auto/config can pin the exact kernel package
|
||||
# (prevents NVIDIA module/kernel mismatch if linux-image-amd64 meta-package
|
||||
# gets updated between build.sh start and lb build chroot step)
|
||||
export BEE_KERNEL_ABI="${DEBIAN_KERNEL_ABI}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||
echo "=== installing linux-headers-${KVER} (kernel updated since image build) ==="
|
||||
apt-get install -y "linux-headers-${KVER}"
|
||||
fi
|
||||
|
||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
echo "=== syncing git submodules ==="
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
NEED_BUILD=1
|
||||
if [ -f "$BEE_BIN" ]; then
|
||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||
@@ -56,7 +387,7 @@ if [ "$NEED_BUILD" = "1" ]; then
|
||||
cd "${REPO_ROOT}/audit"
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION:-$(date +%Y%m%d)}" \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||
-o "$BEE_BIN" \
|
||||
./cmd/bee
|
||||
echo "binary: $BEE_BIN"
|
||||
@@ -74,34 +405,82 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-burn worker ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_BURN_WORKER_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_BURN_WORKER_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay ==="
|
||||
rm -rf "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
rsync -a "${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
--exclude='.build/' \
|
||||
--exclude='*.iso' \
|
||||
--exclude='*.packages' \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
# Persist: sync back after build (done after lb build below).
|
||||
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
||||
mkdir -p "${LB_PKG_CACHE}"
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
rsync -a --delete "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||
elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; then
|
||||
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||
fi
|
||||
|
||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||
rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
rm -rf \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
|
||||
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||
@@ -140,8 +519,15 @@ fi
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
@@ -158,57 +544,171 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
fi
|
||||
done
|
||||
|
||||
# --- build NVIDIA kernel modules ---
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
|
||||
printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
echo ""
|
||||
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
|
||||
echo "=== john injected ==="
|
||||
fi
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||
else
|
||||
GPU_VERSION_LINE=""
|
||||
GPU_BUILD_INFO="nogpu"
|
||||
fi
|
||||
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${AUDIT_VERSION}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION}
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
${GPU_VERSION_LINE}
|
||||
EOF
|
||||
|
||||
# Write GPU vendor marker for hooks
|
||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||
|
||||
# Patch motd with build info
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||
fi
|
||||
|
||||
# --- copy variant-specific package list, remove all other variant lists ---
|
||||
# live-build picks up ALL .list.chroot files — delete other variants to avoid conflicts.
|
||||
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
rm -f "${BUILD_WORK_DIR}/config/package-lists/bee-nvidia.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-amd.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-nogpu.list.chroot"
|
||||
|
||||
# --- remove archives for the other vendor(s) ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||
else
|
||||
# nogpu: remove both
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||
fi
|
||||
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
|
||||
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
|
||||
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
|
||||
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
|
||||
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
|
||||
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
|
||||
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- sync overlay into live-build includes.chroot ---
|
||||
LB_DIR="${BUILD_WORK_DIR}"
|
||||
LB_INCLUDES="${LB_DIR}/config/includes.chroot"
|
||||
@@ -223,20 +723,33 @@ fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (live-build) ==="
|
||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
lb build 2>&1
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
|
||||
fi
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${AUDIT_VERSION}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done ==="
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user