Compare commits
190 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f | ||
|
|
679aeb9947 | ||
|
|
647e99b697 | ||
|
|
4af997f436 | ||
|
|
6caace0cc0 | ||
|
|
5f0103635b | ||
|
|
84a2551dc0 | ||
|
|
1cfabc9230 | ||
|
|
5dc711de23 | ||
|
|
ab802719f8 | ||
|
|
a94e8007f8 | ||
| c69bf07b27 | |||
| b3cf8e3893 | |||
| 17118298bd | |||
| 65bcc9ce81 | |||
| 0cdfbc5875 | |||
| cf9b54b600 | |||
| 0bfb3fe954 | |||
| 3053cb0710 | |||
| 2038489961 | |||
| e35484013e | |||
| 2cdf034bb0 | |||
| b89580c24d | |||
| df1385d3d6 | |||
| f8cd9a7376 | |||
| d52ec67f8f | |||
| 61c7abaa80 | |||
| d60f7758ba | |||
| 52c3a24b76 | |||
| 028bb30333 | |||
| 7d64e5d215 | |||
| 51b721aeb3 | |||
| bac89bb6e5 | |||
| 7a618da1f9 | |||
| 64ae1c0ff0 | |||
| 49050ca717 | |||
| 5ba72ab315 | |||
| 63363e9629 | |||
|
|
5285c0d101 | ||
|
|
dca4afb8d0 | ||
|
|
b4280941f5 | ||
|
|
f74976ec4c | ||
|
|
18e24a9aa5 | ||
|
|
e306250da7 | ||
|
|
c5b2081ac9 | ||
| 434528083e | |||
| 30aa30cd67 | |||
| 4f76e1de21 | |||
| 3732e64a4a | |||
| 0d925299ff | |||
| a8d5e019a5 | |||
| 72ec086568 | |||
| 7a0b0934df | |||
| d8ca0dca2c | |||
| d90250f80a | |||
| 8d6eaef5de | |||
| 732bf4cbab | |||
| fa6d905a10 | |||
|
|
5c1862ce4c | ||
|
|
b65ef2ea1d | ||
|
|
533d703c97 | ||
|
|
04eb4b5a6d | ||
|
|
4110dbf8a6 | ||
|
|
7237e4d3e4 | ||
|
|
ab3ad77cd6 | ||
|
|
cd9e2cbe13 | ||
|
|
0317dc58fd | ||
|
|
1c5cb45698 | ||
|
|
090b92ca73 | ||
|
|
2dccbc010c | ||
| e84c69d360 | |||
| c80a39e7ac | |||
| a5e0261ff2 | |||
| ee422ede3c | |||
| d560b2fead | |||
| 3cf2e9c9dc | |||
| 19dbabd71d | |||
| a6a07f2626 | |||
| f87461ee4a | |||
| a636146dbd | |||
|
|
303de2df04 | ||
|
|
95124d228f | ||
|
|
54338dbae5 | ||
|
|
2be7ae6d28 | ||
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -2,3 +2,5 @@
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
iso/out/
|
iso/out/
|
||||||
|
build-cache/
|
||||||
|
audit/bee
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
LISTEN ?= :8080
|
LISTEN ?= :8080
|
||||||
AUDIT_PATH ?=
|
AUDIT_PATH ?=
|
||||||
|
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||||
|
|
||||||
RUN_ARGS := web --listen $(LISTEN)
|
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||||
ifneq ($(AUDIT_PATH),)
|
ifneq ($(AUDIT_PATH),)
|
||||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||||
endif
|
endif
|
||||||
@@ -11,6 +12,7 @@ endif
|
|||||||
.PHONY: run build test
|
.PHONY: run build test
|
||||||
|
|
||||||
run:
|
run:
|
||||||
|
mkdir -p $(EXPORT_DIR)
|
||||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||||
|
|
||||||
build:
|
build:
|
||||||
|
|||||||
@@ -2,11 +2,14 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -29,10 +32,19 @@ func main() {
|
|||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
|
|
||||||
func run(args []string, stdout, stderr io.Writer) int {
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
Level: slog.LevelInfo,
|
Level: slog.LevelInfo,
|
||||||
})))
|
})))
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("fatal panic",
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
exitCode = 1
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
printRootUsage(stderr)
|
printRootUsage(stderr)
|
||||||
@@ -56,8 +68,14 @@ func run(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle(args[1:], stdout, stderr)
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -74,8 +92,11 @@ func printRootUsage(w io.Writer) {
|
|||||||
bee preflight --output stdout|file:<path>
|
bee preflight --output stdout|file:<path>
|
||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
|
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
|
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -92,8 +113,14 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
case "web":
|
case "web":
|
||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
|
case "blackbox":
|
||||||
|
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
|
case "bee-worker":
|
||||||
|
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -280,7 +307,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||||
fs.Usage = func() {
|
fs.Usage = func() {
|
||||||
@@ -319,6 +346,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
|
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||||
|
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||||
|
slog.Error("run blackbox", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||||
@@ -366,9 +420,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
}
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
dur := *duration
|
dur := *duration
|
||||||
if dur <= 0 {
|
if dur <= 0 {
|
||||||
@@ -383,3 +437,107 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
slog.Info("sat archive written", "target", target, "path", archive)
|
slog.Info("sat archive written", "target", target, "path", archive)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||||
|
if len(args) == 0 {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||||
|
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
target := args[0]
|
||||||
|
if target != "nvidia" {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||||
|
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||||
|
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||||
|
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||||
|
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||||
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
|
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: *profile,
|
||||||
|
SizeMB: *sizeMB,
|
||||||
|
GPUIndices: includeIndices,
|
||||||
|
ExcludeGPUIndices: excludeIndices,
|
||||||
|
RunNCCL: !*skipNCCL,
|
||||||
|
}, logLine)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run benchmark", "target", target, "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||||
|
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||||
|
fs.PrintDefaults()
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
part = strings.TrimSpace(part)
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(part)
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||||
|
}
|
||||||
|
indices = append(indices, value)
|
||||||
|
}
|
||||||
|
return indices, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,22 +5,18 @@ go 1.25.0
|
|||||||
replace reanimator/chart => ../internal/chart
|
replace reanimator/chart => ../internal/chart
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/go-analyze/charts v0.5.26
|
modernc.org/sqlite v1.48.0
|
||||||
reanimator/chart v0.0.0-00010101000000-000000000000
|
reanimator/chart v0.0.0-00010101000000-000000000000
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
github.com/go-analyze/bulk v0.1.3 // indirect
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect
|
|
||||||
github.com/google/uuid v1.6.0 // indirect
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
golang.org/x/image v0.24.0 // indirect
|
|
||||||
golang.org/x/sys v0.42.0 // indirect
|
golang.org/x/sys v0.42.0 // indirect
|
||||||
modernc.org/libc v1.70.0 // indirect
|
modernc.org/libc v1.72.0 // indirect
|
||||||
modernc.org/mathutil v1.7.1 // indirect
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
modernc.org/memory v1.11.0 // indirect
|
modernc.org/memory v1.11.0 // indirect
|
||||||
modernc.org/sqlite v1.48.0 // indirect
|
|
||||||
)
|
)
|
||||||
|
|||||||
50
audit/go.sum
50
audit/go.sum
@@ -1,37 +1,51 @@
|
|||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
|
||||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
github.com/go-analyze/bulk v0.1.3 h1:pzRdBqzHDAT9PyROt0SlWE0YqPtdmTcEpIJY0C3vF0c=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
github.com/go-analyze/bulk v0.1.3/go.mod h1:afon/KtFJYnekIyN20H/+XUvcLFjE8sKR1CfpqfClgM=
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
github.com/go-analyze/charts v0.5.26 h1:rSwZikLQuFX6cJzwI8OAgaWZneG1kDYxD857ms00ZxY=
|
|
||||||
github.com/go-analyze/charts v0.5.26/go.mod h1:s1YvQhjiSwtLx1f2dOKfiV9x2TT49nVSL6v2rlRpTbY=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g=
|
|
||||||
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
|
|
||||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
|
||||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||||
golang.org/x/image v0.24.0 h1:AN7zRgVsbvmTfNyqIbbOraYL8mSwcKncEj8ofjgzcMQ=
|
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||||
golang.org/x/image v0.24.0/go.mod h1:4b/ITuLfqYq1hqZcjofwctIhi7sZh2WaCjvsBNjjya8=
|
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||||
modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
|
modernc.org/cc/v4 v4.27.3 h1:uNCgn37E5U09mTv1XgskEVUJ8ADKpmFMPxzGJ0TSo+U=
|
||||||
modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
|
modernc.org/cc/v4 v4.27.3/go.mod h1:3YjcbCqhoTTHPycJDRl2WZKKFj0nwcOIPBfEZK0Hdk8=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4 h1:L5OB8rpEX4ZsXEQwGozRfJyJSFHbbNVOoQ59DU9/KuU=
|
||||||
|
modernc.org/ccgo/v4 v4.32.4/go.mod h1:lY7f+fiTDHfcv6YlRgSkxYfhs+UvOEEzj49jAn2TOx0=
|
||||||
|
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
|
||||||
|
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
|
||||||
|
modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.72.0 h1:IEu559v9a0XWjw0DPoVKtXpO2qt5NVLAnFaBbjq+n8c=
|
||||||
|
modernc.org/libc v1.72.0/go.mod h1:tTU8DL8A+XLVkEY3x5E/tO7s2Q/q42EtnNWda/L5QhQ=
|
||||||
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
modernc.org/sqlite v1.48.0 h1:ElZyLop3Q2mHYk5IFPPXADejZrlHu7APbpB0sF78bq4=
|
||||||
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
modernc.org/sqlite v1.48.0/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
|
|||||||
@@ -19,17 +19,22 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
|
DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
|
||||||
|
DefaultBeeBenchAutotuneDir = DefaultBeeBenchBaseDir + "/autotune"
|
||||||
|
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
|
||||||
|
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
|
||||||
|
DefaultBeeBenchPowerSourceConfigPath = DefaultBeeBenchBaseDir + "/power-source-autotune.json"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -83,6 +88,7 @@ type installer interface {
|
|||||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||||
IsLiveMediaInRAM() bool
|
IsLiveMediaInRAM() bool
|
||||||
LiveBootSource() platform.LiveBootSource
|
LiveBootSource() platform.LiveBootSource
|
||||||
|
LiveMediaRAMState() platform.LiveMediaRAMState
|
||||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,6 +113,10 @@ func (a *App) LiveBootSource() platform.LiveBootSource {
|
|||||||
return a.installer.LiveBootSource()
|
return a.installer.LiveBootSource()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) LiveMediaRAMState() platform.LiveMediaRAMState {
|
||||||
|
return a.installer.LiveMediaRAMState()
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||||
}
|
}
|
||||||
@@ -114,9 +124,19 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
|||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
ResetNvidiaGPU(index int) (string, error)
|
||||||
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||||
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
@@ -129,7 +149,7 @@ type satRunner interface {
|
|||||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -181,6 +201,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
|
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -195,10 +216,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -223,10 +241,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -292,7 +307,7 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
|||||||
}
|
}
|
||||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
data, err := os.ReadFile(DefaultAuditJSONPath)
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -520,6 +535,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return a.sat.ListNvidiaGPUs()
|
return a.sat.ListNvidiaGPUs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -532,10 +556,106 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -544,14 +664,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -576,14 +696,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
@@ -670,8 +790,15 @@ func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platfo
|
|||||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
body += "\nERROR: " + err.Error()
|
body += "\nERROR: " + err.Error()
|
||||||
@@ -868,6 +995,41 @@ func bodyOr(body, fallback string) string {
|
|||||||
return body
|
return body
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||||
|
// component-status DB so they are visible in the Hardware Summary card.
|
||||||
|
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||||
|
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||||
|
if db == nil || len(psus) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "audit:ipmi"
|
||||||
|
worstStatus := "OK"
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.Status == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
slot := "?"
|
||||||
|
if psu.Slot != nil {
|
||||||
|
slot = *psu.Slot
|
||||||
|
}
|
||||||
|
st := *psu.Status
|
||||||
|
detail := ""
|
||||||
|
if psu.ErrorDescription != nil {
|
||||||
|
detail = *psu.ErrorDescription
|
||||||
|
}
|
||||||
|
db.Record("psu:"+slot, source, st, detail)
|
||||||
|
switch st {
|
||||||
|
case "Critical":
|
||||||
|
worstStatus = "Critical"
|
||||||
|
case "Warning":
|
||||||
|
if worstStatus != "Critical" {
|
||||||
|
worstStatus = "Warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
db.Record("psu:all", source, worstStatus, "")
|
||||||
|
}
|
||||||
|
|
||||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -886,6 +1048,12 @@ func latestSATSummaries() []string {
|
|||||||
prefix string
|
prefix string
|
||||||
}{
|
}{
|
||||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
{label: "Memory SAT", prefix: "memory-"},
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
{label: "Storage SAT", prefix: "storage-"},
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
{label: "CPU SAT", prefix: "cpu-"},
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -120,15 +121,26 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runNvidiaAutotuneFn func(string, platform.NvidiaBenchmarkOptions, string) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
detectVendorFn func() string
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
|
runNCCLFn func(string, []int) (string, error)
|
||||||
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
|
runMemoryFn func(string) (string, error)
|
||||||
|
runStorageFn func(string) (string, error)
|
||||||
|
runCPUFn func(string, int) (string, error)
|
||||||
|
detectVendorFn func() string
|
||||||
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
|
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||||
|
resetNvidiaGPUFn func(int) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -139,6 +151,62 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBenchmarkFn != nil {
|
||||||
|
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerBenchFn != nil {
|
||||||
|
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerSourceAutotune(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaAutotuneFn != nil {
|
||||||
|
return f.runNvidiaAutotuneFn(baseDir, opts, benchmarkKind)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaComputeFn != nil {
|
||||||
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerFn != nil {
|
||||||
|
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPulseFn != nil {
|
||||||
|
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBandwidthFn != nil {
|
||||||
|
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaStressFn != nil {
|
if f.runNvidiaStressFn != nil {
|
||||||
return f.runNvidiaStressFn(baseDir, opts)
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
@@ -153,11 +221,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
if f.listNvidiaGPUStatusesFn != nil {
|
||||||
|
return f.listNvidiaGPUStatusesFn()
|
||||||
|
}
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
if f.resetNvidiaGPUFn != nil {
|
||||||
|
return f.resetNvidiaGPUFn(index)
|
||||||
|
}
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||||
return f.runMemoryFn(baseDir)
|
return f.runMemoryFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,10 +297,43 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNCCLFn != nil {
|
||||||
|
return f.runNCCLFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunNCCLTestsPassesSelectedGPUs(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
var gotBaseDir string
|
||||||
|
var gotGPUIndices []int
|
||||||
|
a := &App{
|
||||||
|
sat: fakeSAT{
|
||||||
|
runNCCLFn: func(baseDir string, gpuIndices []int) (string, error) {
|
||||||
|
gotBaseDir = baseDir
|
||||||
|
gotGPUIndices = append([]int(nil), gpuIndices...)
|
||||||
|
return "/tmp/nccl-tests.tar.gz", nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
path, err := a.RunNCCLTests(context.Background(), "/tmp/sat", []int{3, 1}, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunNCCLTests error: %v", err)
|
||||||
|
}
|
||||||
|
if path != "/tmp/nccl-tests.tar.gz" {
|
||||||
|
t.Fatalf("path=%q want %q", path, "/tmp/nccl-tests.tar.gz")
|
||||||
|
}
|
||||||
|
if gotBaseDir != "/tmp/sat" {
|
||||||
|
t.Fatalf("baseDir=%q want %q", gotBaseDir, "/tmp/sat")
|
||||||
|
}
|
||||||
|
if len(gotGPUIndices) != 2 || gotGPUIndices[0] != 3 || gotGPUIndices[1] != 1 {
|
||||||
|
t.Fatalf("gpuIndices=%v want [3 1]", gotGPUIndices)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -478,8 +593,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -516,8 +629,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -579,8 +690,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
@@ -709,6 +818,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Join(exportDir, "bee-bench"), 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json"), []byte(`{"version":1,"updated_at":"2026-04-20T01:02:03Z","selected_source":"sdr_psu_input","reason":"selected lowest relative error"}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run.tar.gz"), []byte("nested sat archive"), 0644); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@@ -736,6 +851,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
tr := tar.NewReader(gzr)
|
tr := tar.NewReader(gzr)
|
||||||
var names []string
|
var names []string
|
||||||
var auditJSON string
|
var auditJSON string
|
||||||
|
var manifest string
|
||||||
for {
|
for {
|
||||||
hdr, err := tr.Next()
|
hdr, err := tr.Next()
|
||||||
if errors.Is(err, io.EOF) {
|
if errors.Is(err, io.EOF) {
|
||||||
@@ -752,11 +868,21 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
auditJSON = string(body)
|
auditJSON = string(body)
|
||||||
}
|
}
|
||||||
|
if strings.HasSuffix(hdr.Name, "/manifest.txt") {
|
||||||
|
body, err := io.ReadAll(tr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read manifest entry: %v", err)
|
||||||
|
}
|
||||||
|
manifest = string(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, want := range []string{
|
for _, want := range []string{
|
||||||
"/system/ip-link.txt",
|
"/system/ip-link.txt",
|
||||||
"/system/ip-link-stats.txt",
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/kernel-aer-nvidia.txt",
|
||||||
|
"/system/lspci-nvidia-bridges-vv.txt",
|
||||||
|
"/system/pcie-aer-sysfs.txt",
|
||||||
"/system/ethtool-info.txt",
|
"/system/ethtool-info.txt",
|
||||||
"/system/ethtool-link.txt",
|
"/system/ethtool-link.txt",
|
||||||
"/system/ethtool-module.txt",
|
"/system/ethtool-module.txt",
|
||||||
@@ -792,6 +918,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||||
}
|
}
|
||||||
|
if !contains(manifest, "files:") {
|
||||||
|
t.Fatalf("support bundle manifest missing files section:\n%s", manifest)
|
||||||
|
}
|
||||||
|
if !strings.Contains(manifest, "power_autotune_selected_source=sdr_psu_input") {
|
||||||
|
t.Fatalf("support bundle manifest missing autotune source:\n%s", manifest)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
|
|||||||
67
audit/internal/app/atomic_write.go
Normal file
67
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
// readFileLimited reads path into memory, refusing files larger than maxBytes.
|
||||||
|
// Prevents OOM on corrupted or unexpectedly large data files.
|
||||||
|
func readFileLimited(path string, maxBytes int64) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, maxBytes+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > maxBytes {
|
||||||
|
return nil, fmt.Errorf("file %s too large (exceeds %d bytes)", path, maxBytes)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpPath := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
success := false
|
||||||
|
defer func() {
|
||||||
|
_ = f.Close()
|
||||||
|
if !success {
|
||||||
|
_ = os.Remove(tmpPath)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmpPath, path); err != nil {
|
||||||
|
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||||
|
_ = dir.Sync()
|
||||||
|
_ = dir.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("seed file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("atomicWriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read final: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "new\n" {
|
||||||
|
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||||
|
a := &App{
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
return schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: exportDir,
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := a.RunRuntimePreflight("file:" + path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||||
|
}
|
||||||
|
if got != path {
|
||||||
|
t.Fatalf("path=%q want %q", got, path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read runtime file: %v", err)
|
||||||
|
}
|
||||||
|
var health schema.RuntimeHealth
|
||||||
|
if err := json.Unmarshal(raw, &health); err != nil {
|
||||||
|
t.Fatalf("json unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if health.Status != "OK" {
|
||||||
|
t.Fatalf("status=%q want OK", health.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
779
audit/internal/app/blackbox.go
Normal file
779
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,779 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"crypto/rand"
|
||||||
|
"encoding/hex"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
blackboxMarkerName = ".bee-blackbox"
|
||||||
|
blackboxDiscoverInterval = 2 * time.Second
|
||||||
|
blackboxMinFlushPeriod = 1 * time.Second
|
||||||
|
blackboxMaxFlushPeriod = 30 * time.Second
|
||||||
|
blackboxRecoveryFastCount = 5
|
||||||
|
)
|
||||||
|
|
||||||
|
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||||
|
|
||||||
|
var (
|
||||||
|
blackboxExecCommand = exec.Command
|
||||||
|
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||||
|
)
|
||||||
|
|
||||||
|
type BlackboxMarker struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
CreatedAtUTC string `json:"created_at_utc"`
|
||||||
|
Host string `json:"host,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxTargetStatus struct {
|
||||||
|
EnrollmentID string `json:"enrollment_id"`
|
||||||
|
Device string `json:"device"`
|
||||||
|
FS platform.RemovableTarget `json:"fs"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||||
|
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||||
|
FlushPeriod string `json:"flush_period"`
|
||||||
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
Mountpoint string `json:"mountpoint,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BlackboxState struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||||
|
BootFolder string `json:"boot_folder"`
|
||||||
|
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||||
|
Targets []BlackboxTargetStatus `json:"targets"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxRuntime struct {
|
||||||
|
exportDir string
|
||||||
|
statePath string
|
||||||
|
system *platform.System
|
||||||
|
bootStarted time.Time
|
||||||
|
bootFolder string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
workers map[string]*blackboxWorker
|
||||||
|
}
|
||||||
|
|
||||||
|
type discoveredBlackboxTarget struct {
|
||||||
|
marker BlackboxMarker
|
||||||
|
target platform.RemovableTarget
|
||||||
|
seenMount string
|
||||||
|
mountedByBee bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type blackboxWorker struct {
|
||||||
|
runtime *blackboxRuntime
|
||||||
|
enrollmentID string
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
target platform.RemovableTarget
|
||||||
|
marker BlackboxMarker
|
||||||
|
mountpoint string
|
||||||
|
mountedByBee bool
|
||||||
|
status string
|
||||||
|
lastSyncAt time.Time
|
||||||
|
lastDuration time.Duration
|
||||||
|
flushPeriod time.Duration
|
||||||
|
lastError string
|
||||||
|
fastCycles int
|
||||||
|
stopCh chan struct{}
|
||||||
|
stoppedCh chan struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||||
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
|
if exportDir == "" {
|
||||||
|
exportDir = DefaultExportDir
|
||||||
|
}
|
||||||
|
statePath = strings.TrimSpace(statePath)
|
||||||
|
if statePath == "" {
|
||||||
|
statePath = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
if system == nil {
|
||||||
|
system = platform.New()
|
||||||
|
}
|
||||||
|
bootStarted, err := bootStartedAtUTC()
|
||||||
|
if err != nil {
|
||||||
|
bootStarted = blackboxNow()
|
||||||
|
}
|
||||||
|
rt := &blackboxRuntime{
|
||||||
|
exportDir: exportDir,
|
||||||
|
statePath: statePath,
|
||||||
|
system: system,
|
||||||
|
bootStarted: bootStarted,
|
||||||
|
bootFolder: SupportBundleBaseName(bootStarted),
|
||||||
|
workers: make(map[string]*blackboxWorker),
|
||||||
|
}
|
||||||
|
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||||
|
rt.persistState()
|
||||||
|
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
rt.reconcile()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
rt.stopAll()
|
||||||
|
return ctx.Err()
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||||
|
path = strings.TrimSpace(path)
|
||||||
|
if path == "" {
|
||||||
|
path = DefaultBlackboxStatePath
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
var state BlackboxState
|
||||||
|
if err := json.Unmarshal(raw, &state); err != nil {
|
||||||
|
return BlackboxState{}, err
|
||||||
|
}
|
||||||
|
return state, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Device == "" {
|
||||||
|
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||||
|
if err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
if marker.EnrollmentID == "" {
|
||||||
|
marker = BlackboxMarker{
|
||||||
|
Version: 1,
|
||||||
|
EnrollmentID: newBlackboxEnrollmentID(),
|
||||||
|
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Host: hostnameOr("unknown"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||||
|
return BlackboxMarker{}, err
|
||||||
|
}
|
||||||
|
return marker, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||||
|
device = strings.TrimSpace(device)
|
||||||
|
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||||
|
if device == "" && enrollmentID == "" {
|
||||||
|
return fmt.Errorf("device or enrollment_id is required")
|
||||||
|
}
|
||||||
|
system := platform.New()
|
||||||
|
targets, err := system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, target := range targets {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||||
|
if mountErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remove := false
|
||||||
|
marker, _, err := readBlackboxMarker(mountpoint)
|
||||||
|
if err == nil {
|
||||||
|
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
if device != "" && target.Device == device {
|
||||||
|
remove = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if remove {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return os.ErrNotExist
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) reconcile() {
|
||||||
|
discovered, _ := rt.discoverMarkedTargets()
|
||||||
|
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
|
||||||
|
seen := make(map[string]struct{}, len(discovered))
|
||||||
|
for _, found := range discovered {
|
||||||
|
seen[found.marker.EnrollmentID] = struct{}{}
|
||||||
|
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||||
|
if !ok {
|
||||||
|
worker = newBlackboxWorker(rt, found)
|
||||||
|
rt.workers[found.marker.EnrollmentID] = worker
|
||||||
|
go worker.run()
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.update(found)
|
||||||
|
}
|
||||||
|
for id, worker := range rt.workers {
|
||||||
|
if _, ok := seen[id]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
worker.stop()
|
||||||
|
delete(rt.workers, id)
|
||||||
|
}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) stopAll() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
workers = append(workers, worker)
|
||||||
|
}
|
||||||
|
rt.workers = map[string]*blackboxWorker{}
|
||||||
|
rt.persistStateLocked()
|
||||||
|
rt.mu.Unlock()
|
||||||
|
for _, worker := range workers {
|
||||||
|
worker.stop()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||||
|
targets, err := rt.system.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var out []discoveredBlackboxTarget
|
||||||
|
for _, rawTarget := range targets {
|
||||||
|
target := sanitizeRemovableTarget(rawTarget)
|
||||||
|
if target.Device == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||||
|
if mountedByBee && !ok {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if mountedByBee {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
out = append(out, discoveredBlackboxTarget{
|
||||||
|
marker: marker,
|
||||||
|
target: target,
|
||||||
|
seenMount: mountpoint,
|
||||||
|
mountedByBee: mountedByBee,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(out, func(i, j int) bool {
|
||||||
|
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||||
|
})
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||||
|
return &blackboxWorker{
|
||||||
|
runtime: rt,
|
||||||
|
enrollmentID: found.marker.EnrollmentID,
|
||||||
|
target: found.target,
|
||||||
|
marker: found.marker,
|
||||||
|
flushPeriod: blackboxMinFlushPeriod,
|
||||||
|
status: "running",
|
||||||
|
stopCh: make(chan struct{}),
|
||||||
|
stoppedCh: make(chan struct{}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) run() {
|
||||||
|
defer close(w.stoppedCh)
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
err := w.syncCycle()
|
||||||
|
duration := time.Since(start)
|
||||||
|
w.finishCycle(duration, err)
|
||||||
|
|
||||||
|
wait := w.currentFlushPeriod()
|
||||||
|
timer := time.NewTimer(wait)
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
timer.Stop()
|
||||||
|
w.cleanup()
|
||||||
|
return
|
||||||
|
case <-timer.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.target = found.target
|
||||||
|
w.marker = found.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) stop() {
|
||||||
|
select {
|
||||||
|
case <-w.stopCh:
|
||||||
|
default:
|
||||||
|
close(w.stopCh)
|
||||||
|
}
|
||||||
|
<-w.stoppedCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.flushPeriod
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.lastDuration = duration
|
||||||
|
if err != nil {
|
||||||
|
w.status = "degraded"
|
||||||
|
w.lastError = err.Error()
|
||||||
|
w.fastCycles = 0
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||||
|
} else {
|
||||||
|
w.status = "running"
|
||||||
|
w.lastSyncAt = blackboxNow()
|
||||||
|
w.lastError = ""
|
||||||
|
if duration <= w.flushPeriod/2 {
|
||||||
|
w.fastCycles++
|
||||||
|
} else {
|
||||||
|
w.fastCycles = 0
|
||||||
|
}
|
||||||
|
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||||
|
}
|
||||||
|
w.runtime.persistState()
|
||||||
|
}
|
||||||
|
|
||||||
|
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||||
|
if current <= 0 {
|
||||||
|
current = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if duration <= 0 {
|
||||||
|
duration = current
|
||||||
|
}
|
||||||
|
next := current
|
||||||
|
if duration > current {
|
||||||
|
growA := time.Duration(float64(current) * 1.25)
|
||||||
|
growB := time.Duration(float64(duration) * 1.25)
|
||||||
|
if growB > growA {
|
||||||
|
next = growB
|
||||||
|
} else {
|
||||||
|
next = growA
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||||
|
next = time.Duration(float64(current) * 0.9)
|
||||||
|
}
|
||||||
|
if next < blackboxMinFlushPeriod {
|
||||||
|
next = blackboxMinFlushPeriod
|
||||||
|
}
|
||||||
|
if next > blackboxMaxFlushPeriod {
|
||||||
|
next = blackboxMaxFlushPeriod
|
||||||
|
}
|
||||||
|
return next
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) syncCycle() error {
|
||||||
|
target, marker := w.snapshotTarget()
|
||||||
|
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
w.recordMountpoint(mountpoint, mountedByBee)
|
||||||
|
|
||||||
|
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||||
|
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := w.captureSnapshots(root); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(root)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) cleanup() {
|
||||||
|
w.mu.Lock()
|
||||||
|
mountpoint := w.mountpoint
|
||||||
|
mountedByBee := w.mountedByBee
|
||||||
|
w.mu.Unlock()
|
||||||
|
if mountedByBee && mountpoint != "" {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
return w.target, w.marker
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
w.mountpoint = mountpoint
|
||||||
|
w.mountedByBee = mountedByBee
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, svc := range supportBundleServices {
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, item := range supportBundleOptionalFiles {
|
||||||
|
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistState() {
|
||||||
|
rt.mu.Lock()
|
||||||
|
defer rt.mu.Unlock()
|
||||||
|
rt.persistStateLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rt *blackboxRuntime) persistStateLocked() {
|
||||||
|
state := BlackboxState{
|
||||||
|
Status: "disabled",
|
||||||
|
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||||
|
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||||
|
}
|
||||||
|
if len(rt.workers) > 0 {
|
||||||
|
state.Status = "running"
|
||||||
|
}
|
||||||
|
for _, worker := range rt.workers {
|
||||||
|
worker.mu.Lock()
|
||||||
|
targetState := BlackboxTargetStatus{
|
||||||
|
EnrollmentID: worker.enrollmentID,
|
||||||
|
Device: worker.target.Device,
|
||||||
|
FS: worker.target,
|
||||||
|
BootFolder: rt.bootFolder,
|
||||||
|
Status: worker.status,
|
||||||
|
FlushPeriod: worker.flushPeriod.String(),
|
||||||
|
LastError: worker.lastError,
|
||||||
|
Mountpoint: worker.mountpoint,
|
||||||
|
}
|
||||||
|
if !worker.lastSyncAt.IsZero() {
|
||||||
|
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||||
|
}
|
||||||
|
if worker.lastDuration > 0 {
|
||||||
|
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||||
|
}
|
||||||
|
if worker.status == "degraded" {
|
||||||
|
state.Status = "degraded"
|
||||||
|
}
|
||||||
|
worker.mu.Unlock()
|
||||||
|
state.Targets = append(state.Targets, targetState)
|
||||||
|
}
|
||||||
|
sort.Slice(state.Targets, func(i, j int) bool {
|
||||||
|
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||||
|
})
|
||||||
|
_ = writeJSONAtomic(rt.statePath, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
func bootStartedAtUTC() (time.Time, error) {
|
||||||
|
raw, err := os.ReadFile("/proc/stat")
|
||||||
|
if err != nil {
|
||||||
|
return time.Time{}, err
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "btime ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Fields(line)
|
||||||
|
if len(parts) != 2 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
sec, err := time.ParseDuration(parts[1] + "s")
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
return time.Unix(int64(sec/time.Second), 0).UTC(), nil
|
||||||
|
}
|
||||||
|
return time.Time{}, fmt.Errorf("boot time not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBlackboxEnrollmentID() string {
|
||||||
|
var buf [8]byte
|
||||||
|
if _, err := rand.Read(buf[:]); err != nil {
|
||||||
|
return fmt.Sprintf("bb-%d", time.Now().UnixNano())
|
||||||
|
}
|
||||||
|
return "bb-" + hex.EncodeToString(buf[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||||
|
target.Device = strings.TrimSpace(target.Device)
|
||||||
|
target.FSType = strings.TrimSpace(target.FSType)
|
||||||
|
target.Size = strings.TrimSpace(target.Size)
|
||||||
|
target.Label = strings.TrimSpace(target.Label)
|
||||||
|
target.Model = strings.TrimSpace(target.Model)
|
||||||
|
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||||
|
return target
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||||
|
target = sanitizeRemovableTarget(target)
|
||||||
|
if target.Mountpoint != "" {
|
||||||
|
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||||
|
return target.Mountpoint, false, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||||
|
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||||
|
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||||
|
}
|
||||||
|
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||||
|
_ = unmountTarget(mountpoint)
|
||||||
|
return "", false, err
|
||||||
|
}
|
||||||
|
return mountpoint, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func unmountTarget(mountpoint string) error {
|
||||||
|
_ = blackboxExecCommand("sync").Run()
|
||||||
|
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(raw))
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||||
|
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, os.ErrNotExist) {
|
||||||
|
return BlackboxMarker{}, false, os.ErrNotExist
|
||||||
|
}
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
var marker BlackboxMarker
|
||||||
|
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||||
|
return BlackboxMarker{}, false, err
|
||||||
|
}
|
||||||
|
return marker, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||||
|
if marker.Version == 0 {
|
||||||
|
marker.Version = 1
|
||||||
|
}
|
||||||
|
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||||
|
}
|
||||||
|
|
||||||
|
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(srcDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
seen["."] = struct{}{}
|
||||||
|
return os.MkdirAll(dstDir, 0755)
|
||||||
|
}
|
||||||
|
seen[rel] = struct{}{}
|
||||||
|
dstPath := filepath.Join(dstDir, rel)
|
||||||
|
if d.IsDir() {
|
||||||
|
info, err := d.Info()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
return copyFileIfChanged(path, dstPath)
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return removeMissingPaths(dstDir, seen)
|
||||||
|
}
|
||||||
|
|
||||||
|
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
|
||||||
|
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(dstDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rel = filepath.Clean(rel)
|
||||||
|
if rel == "." {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if _, ok := seen[rel]; ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return os.RemoveAll(path)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFileIfChanged(src, dst string) error {
|
||||||
|
info, err := os.Stat(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return os.MkdirAll(dst, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
srcData, err := os.ReadFile(src)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||||
|
}
|
||||||
|
|
||||||
|
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||||
|
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||||
|
if len(raw) == 0 {
|
||||||
|
if err != nil {
|
||||||
|
raw = []byte(err.Error() + "\n")
|
||||||
|
} else {
|
||||||
|
raw = []byte("no output\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return writeFileAtomic(dst, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONAtomic(path string, v any) error {
|
||||||
|
raw, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
raw = append(raw, '\n')
|
||||||
|
return writeFileAtomic(path, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
_ = f.Close()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmp, path); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return syncFilesystem(filepath.Dir(path))
|
||||||
|
}
|
||||||
|
|
||||||
|
// syncFilesystem flushes pending filesystem writes to stable storage by
// shelling out to the external "sync" command.
//
// NOTE(review): the path argument is currently unused — "sync" with no
// arguments flushes every mounted filesystem, not just the one containing
// path. Confirm whether a targeted sync was intended.
func syncFilesystem(path string) error {
	return blackboxExecCommand("sync").Run()
}
|
||||||
|
|
||||||
|
func ensureWritableBlackboxMountpoint(mountpoint string) error {
|
||||||
|
probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||||
|
}
|
||||||
|
name := probe.Name()
|
||||||
|
if closeErr := probe.Close(); closeErr != nil {
|
||||||
|
_ = os.Remove(name)
|
||||||
|
return closeErr
|
||||||
|
}
|
||||||
|
if err := os.Remove(name); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||||
|
msg := strings.TrimSpace(raw)
|
||||||
|
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||||
|
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||||
|
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: %w", msg, err)
|
||||||
|
}
|
||||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestAdjustFlushPeriodGrowsOnSlowCycle verifies that a flush cycle that
// took longer than the current period (4s elapsed vs a 2s period) makes
// adjustFlushPeriod back off to a strictly larger period.
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
	current := 2 * time.Second
	got := adjustFlushPeriod(current, 4*time.Second, false, 0)
	if got <= current {
		t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
	}
}
|
||||||
|
|
||||||
|
// TestAdjustFlushPeriodShrinksAfterFastCycles verifies that after enough
// consecutive fast cycles (blackboxRecoveryFastCount) adjustFlushPeriod
// shrinks the period, while never dropping below blackboxMinFlushPeriod.
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
	current := 10 * time.Second
	got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
	if got >= current {
		t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
	}
	if got < blackboxMinFlushPeriod {
		t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
	}
}
|
||||||
|
|
||||||
|
// TestReadBlackboxState round-trips a BlackboxState through writeJSONAtomic
// and ReadBlackboxState, spot-checking that the top-level fields and the
// single target entry survive the trip.
func TestReadBlackboxState(t *testing.T) {
	path := filepath.Join(t.TempDir(), "blackbox-state.json")
	want := BlackboxState{
		Status:           "running",
		BootStartedAtUTC: "2026-04-24T00:00:00Z",
		BootFolder:       "boot-folder",
		UpdatedAtUTC:     "2026-04-24T00:00:01Z",
		Targets: []BlackboxTargetStatus{{
			EnrollmentID: "bb-1",
			Device:       "/dev/sdb1",
			Status:       "running",
			FlushPeriod:  "1s",
		}},
	}
	if err := writeJSONAtomic(path, want); err != nil {
		t.Fatalf("writeJSONAtomic: %v", err)
	}
	got, err := ReadBlackboxState(path)
	if err != nil {
		t.Fatalf("ReadBlackboxState: %v", err)
	}
	// Spot-check rather than deep-compare: full equality is covered by the
	// JSON codec; this guards the fields callers actually read.
	if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
		t.Fatalf("state=%+v", got)
	}
}
|
||||||
@@ -21,12 +21,12 @@ type ComponentStatusDB struct {
|
|||||||
|
|
||||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
type ComponentStatusRecord struct {
|
type ComponentStatusRecord struct {
|
||||||
ComponentKey string `json:"component_key"`
|
ComponentKey string `json:"component_key"`
|
||||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
LastChangedAt time.Time `json:"last_changed_at"`
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
ErrorSummary string `json:"error_summary,omitempty"`
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
History []ComponentStatusEntry `json:"history"`
|
History []ComponentStatusEntry `json:"history"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// ComponentStatusEntry is one observation written to a component's history.
|
// ComponentStatusEntry is one observation written to a component's history.
|
||||||
@@ -46,7 +46,7 @@ func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
|||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(path)
|
data, err := readFileLimited(path, 10<<20)
|
||||||
if err != nil && !os.IsNotExist(err) {
|
if err != nil && !os.IsNotExist(err) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -179,7 +179,9 @@ func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
|||||||
|
|
||||||
// Map SAT target to component keys.
|
// Map SAT target to component keys.
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||||
|
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
case "memory", "memory-stress", "sat-stress":
|
case "memory", "memory-stress", "sat-stress":
|
||||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||||
|
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||||
}
|
}
|
||||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||||
applyMemorySAT(snap.Memory, summary)
|
applyMemorySAT(snap.Memory, summary)
|
||||||
@@ -32,6 +34,100 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *C
|
|||||||
applyComponentStatusDB(snap, db)
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvidiaPerGPUStatus is the parsed per-GPU outcome of an NVIDIA SAT run,
// read from one gpu-N-status.txt file in the run directory.
type nvidiaPerGPUStatus struct {
	// runStatus is the upper-cased, trimmed run_status value from the file.
	runStatus string
	// reason is the optional trimmed human-readable failure explanation.
	reason string
}
|
||||||
|
|
||||||
|
// applyNvidiaPerGPUStatus overlays per-GPU SAT run results onto the PCIe
// device list. Devices are matched to results via the "nvidia_gpu_index"
// telemetry key; devices without telemetry, without that key, or without a
// matching per-GPU status entry are left untouched.
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
	statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
	if !ok {
		return
	}
	for i := range devs {
		if devs[i].Telemetry == nil {
			continue
		}
		rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
		if !ok {
			continue
		}
		// Telemetry values are untyped; accept any int-like representation.
		idx, ok := telemetryInt(rawIdx)
		if !ok {
			continue
		}
		st, ok := statusByIndex[idx]
		if !ok {
			continue
		}
		// Fall back to a generic label when the status file carried no
		// reason text.
		status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
		if !ok {
			continue
		}
		mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
	}
}
|
||||||
|
|
||||||
|
// loadLatestNvidiaPerGPUStatus reads the most recent "gpu-nvidia-*" SAT
// run directory under baseDir and parses every gpu-*-status.txt file in
// it. It returns the per-GPU statuses keyed by gpu_index, the run's
// run_at_utc timestamp from summary.txt, and whether anything was loaded.
// "Latest" is chosen by lexicographic sort of the directory names —
// assumes the names embed a sortable timestamp (TODO confirm naming).
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
	matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
	if err != nil || len(matches) == 0 {
		return nil, "", false
	}
	sort.Strings(matches)
	runDir := matches[len(matches)-1]
	// A run without summary.txt is treated as no data at all.
	summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
	if err != nil {
		return nil, "", false
	}
	summaryKV := parseKeyValueSummary(string(summaryRaw))
	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
	if err != nil || len(files) == 0 {
		return nil, "", false
	}
	out := make(map[int]nvidiaPerGPUStatus, len(files))
	for _, file := range files {
		raw, err := os.ReadFile(file)
		if err != nil {
			// Skip unreadable per-GPU files; the others may still parse.
			continue
		}
		kv := parseKeyValueSummary(string(raw))
		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
		if err != nil {
			continue
		}
		out[idx] = nvidiaPerGPUStatus{
			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
			reason:    strings.TrimSpace(kv["reason"]),
		}
	}
	if len(out) == 0 {
		return nil, "", false
	}
	return out, runAtUTC, true
}
|
||||||
|
|
||||||
|
// telemetryInt coerces a dynamically-typed telemetry value into an int.
// Numeric types convert directly (float64 truncates toward zero); strings
// are trimmed and parsed with strconv.Atoi. The bool result reports
// whether the conversion succeeded.
func telemetryInt(v any) (int, bool) {
	switch raw := v.(type) {
	case int:
		return raw, true
	case int32:
		return int(raw), true
	case int64:
		return int(raw), true
	case float64:
		return int(raw), true
	case string:
		if n, parseErr := strconv.Atoi(strings.TrimSpace(raw)); parseErr == nil {
			return n, true
		}
	}
	return 0, false
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
runAtUTC string
|
runAtUTC string
|
||||||
overall string
|
overall string
|
||||||
@@ -176,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mergeComponentStatusPreferDetail merges a SAT-derived status into
// component, preferring entries that carry concrete detail text. Unlike a
// plain severity merge, an update of EQUAL severity with a non-empty
// description still overwrites the current status/description (and appends
// a history entry when a timestamp is available), so a specific per-GPU
// reason can replace a generic run-level message.
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	newSeverity := statusSeverity(satStatus)
	currentSeverity := statusSeverity(current)
	// First status, unknown status, or strictly worse severity: defer to
	// the standard merge.
	if current == "" || current == "Unknown" || newSeverity > currentSeverity {
		mergeComponentStatus(component, changedAt, satStatus, description)
		return
	}
	// Same severity and we have detail text: overwrite in place.
	if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
		component.Status = appStringPtr(satStatus)
		component.ErrorDescription = appStringPtr(description)
		// Only stamp the change (and extend history) when a timestamp was
		// supplied; otherwise the previous change time stands.
		if strings.TrimSpace(changedAt) != "" {
			component.StatusChangedAt = appStringPtr(changedAt)
			component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
				Status:    satStatus,
				ChangedAt: changedAt,
				Details:   appStringPtr(description),
			})
		}
	}
}
|
||||||
|
|
||||||
func statusSeverity(status string) int {
|
func statusSeverity(status string) int {
|
||||||
switch strings.TrimSpace(status) {
|
switch strings.TrimSpace(status) {
|
||||||
case "Critical":
|
case "Critical":
|
||||||
|
|||||||
@@ -59,3 +59,51 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile checks that a
// per-GPU status file inside the latest gpu-nvidia-* run directory marks
// only the matching GPU (matched via nvidia_gpu_index telemetry) as
// Critical and carries the per-GPU reason into its error description.
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
	baseDir := t.TempDir()
	runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
	if err := os.MkdirAll(runDir, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
		t.Fatal(err)
	}
	// Only GPU index 1 gets a per-GPU status file; GPU 0 must stay untouched.
	if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
		t.Fatal(err)
	}

	class := "VideoController"
	manufacturer := "NVIDIA Corporation"
	bdf0 := "0000:4b:00.0"
	bdf1 := "0000:4f:00.0"
	snap := schema.HardwareSnapshot{
		PCIeDevices: []schema.HardwarePCIeDevice{
			{
				DeviceClass:  &class,
				Manufacturer: &manufacturer,
				BDF:          &bdf0,
				Telemetry:    map[string]any{"nvidia_gpu_index": 0},
			},
			{
				DeviceClass:  &class,
				Manufacturer: &manufacturer,
				BDF:          &bdf1,
				Telemetry:    map[string]any{"nvidia_gpu_index": 1},
			},
		},
	}

	applyLatestSATStatuses(&snap, baseDir, nil)

	if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
		t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
	}
	// NOTE(review): the expected text appends " failed" to the file's
	// reason — presumably added by satKeyStatus; confirm against that
	// helper if this expectation looks surprising.
	if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
		got := "<nil>"
		if snap.PCIeDevices[1].ErrorDescription != nil {
			got = *snap.PCIeDevices[1].ErrorDescription
		}
		t.Fatalf("gpu1 error=%q want per-gpu reason", got)
	}
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package app
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@@ -14,12 +15,17 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var supportBundleServices = []string{
|
var supportBundleServices = []string{
|
||||||
|
"bee-blackbox.service",
|
||||||
"bee-audit.service",
|
"bee-audit.service",
|
||||||
"bee-web.service",
|
"bee-web.service",
|
||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
"bee-nvidia.service",
|
"bee-nvidia.service",
|
||||||
"bee-preflight.service",
|
"bee-preflight.service",
|
||||||
|
"bee-selfheal.service",
|
||||||
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"nvidia-dcgm.service",
|
||||||
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
var supportBundleCommands = []struct {
|
var supportBundleCommands = []struct {
|
||||||
@@ -38,17 +44,112 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
|
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||||
|
else
|
||||||
|
echo "dmesg not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
nvidia-smi topo -m 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "nvidia-smi not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-nvidia-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'nvidia*' 'fabric*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'nvidia|fabric' || echo "no failed nvidia/fabric units"
|
||||||
|
`}},
|
||||||
|
{name: "system/fabric-manager-paths.txt", cmd: []string{"sh", "-c", `
|
||||||
|
for candidate in \
|
||||||
|
/usr/bin/nvidia-fabricmanager \
|
||||||
|
/usr/bin/nv-fabricmanager \
|
||||||
|
/usr/bin/nvidia-fabricmanagerd \
|
||||||
|
/usr/bin/nvlsm; do
|
||||||
|
if [ -e "$candidate" ]; then
|
||||||
|
echo "=== $candidate ==="
|
||||||
|
ls -l "$candidate" 2>&1 || true
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if ! ls /usr/bin/nvidia-fabricmanager /usr/bin/nv-fabricmanager /usr/bin/nvidia-fabricmanagerd /usr/bin/nvlsm >/dev/null 2>&1; then
|
||||||
|
echo "no fabric manager binaries found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
|
echo "lspci not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
|
found=1
|
||||||
|
echo "=== GPU $gpu ==="
|
||||||
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
|
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||||
|
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||||
|
echo
|
||||||
|
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||||
|
lspci -s "$bridge" -vv 2>&1 || true
|
||||||
|
fi
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no NVIDIA PCI devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
dev=$(basename "$d")
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||||
|
found=0
|
||||||
|
for dev in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
bdf=$(basename "$dev")
|
||||||
|
block=""
|
||||||
|
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||||
|
if [ -r "$dev/$f" ]; then
|
||||||
|
if [ -z "$block" ]; then
|
||||||
|
block=1
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
fi
|
||||||
|
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$block" ]; then
|
||||||
|
echo
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no PCIe AER sysfs counters found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
if ! command -v ethtool >/dev/null 2>&1; then
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
@@ -135,9 +236,13 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
{name: "system/fabricmanager/nvlsm.log", src: "/var/log/fabricmanager/nvlsm.log"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
@@ -151,9 +256,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
host := sanitizeFilename(hostnameOr("unknown"))
|
now := time.Now().UTC()
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -185,13 +290,24 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||||
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return archivePath, nil
|
return archivePath, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SupportBundleBaseName(at time.Time) string {
|
||||||
|
at = at.UTC()
|
||||||
|
date := at.Format("2006-01-02")
|
||||||
|
tod := at.Format("150405")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||||
|
}
|
||||||
|
|
||||||
func LatestSupportBundlePath() (string, error) {
|
func LatestSupportBundlePath() (string, error) {
|
||||||
return latestSupportBundlePath(os.TempDir())
|
return latestSupportBundlePath(os.TempDir())
|
||||||
}
|
}
|
||||||
@@ -315,6 +431,13 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
fmt.Fprintf(&body, "host=%s\n", hostnameOr("unknown"))
|
||||||
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&body, "generated_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
fmt.Fprintf(&body, "export_dir=%s\n", exportDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(filepath.Join(exportDir, "bee-bench", "power-source-autotune.json")); err == nil && cfg != nil {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_selected_source=%s\n", cfg.SelectedSource)
|
||||||
|
fmt.Fprintf(&body, "power_autotune_updated_at=%s\n", cfg.UpdatedAt.UTC().Format(time.RFC3339))
|
||||||
|
if strings.TrimSpace(cfg.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "power_autotune_reason=%s\n", cfg.Reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
fmt.Fprintf(&body, "\nfiles:\n")
|
fmt.Fprintf(&body, "\nfiles:\n")
|
||||||
|
|
||||||
var files []string
|
var files []string
|
||||||
@@ -342,6 +465,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bundleVersion() string {
|
||||||
|
v := buildVersion()
|
||||||
|
v = strings.TrimPrefix(v, "v")
|
||||||
|
v = strings.TrimPrefix(v, "V")
|
||||||
|
if v == "" || v == "unknown" {
|
||||||
|
return "0.0"
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
func serverModelForBundle() string {
|
||||||
|
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||||
|
if err != nil {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
key, val, ok := strings.Cut(line, ": ")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(key) == "Product Name" {
|
||||||
|
val = strings.TrimSpace(val)
|
||||||
|
if val == "" {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
return strings.ReplaceAll(val, " ", "_")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
|
func serverSerialForBundle() string {
|
||||||
|
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||||
|
if err != nil {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
key, val, ok := strings.Cut(line, ": ")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(key) == "Serial Number" {
|
||||||
|
val = strings.TrimSpace(val)
|
||||||
|
if val == "" {
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "unknown"
|
||||||
|
}
|
||||||
|
|
||||||
func buildVersion() string {
|
func buildVersion() string {
|
||||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
return exec.CommandContext(ctx, name, args...).Output()
|
return exec.CommandContext(ctx, name, args...).Output()
|
||||||
}
|
}
|
||||||
|
|
||||||
func interfaceHasCarrier(iface string) bool {
|
|
||||||
raw, err := readNetCarrierFile(iface)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(raw) == "1"
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if interfaceHasCarrier(iface) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
enriched++
|
||||||
enriched++
|
continue
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
}
|
}
|
||||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
val := strings.TrimSpace(trimmed[idx+1:])
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
|
case key == "identifier":
|
||||||
|
s := parseSFPIdentifier(val)
|
||||||
|
dev.SFPIdentifier = &s
|
||||||
|
t := true
|
||||||
|
dev.SFPPresent = &t
|
||||||
|
changed = true
|
||||||
|
case key == "connector":
|
||||||
|
s := parseSFPConnector(val)
|
||||||
|
dev.SFPConnector = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor name":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPVendor = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor pn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPPartNumber = &s
|
||||||
|
changed = true
|
||||||
|
case key == "vendor sn":
|
||||||
|
s := strings.TrimSpace(val)
|
||||||
|
dev.SFPSerialNumber = &s
|
||||||
|
changed = true
|
||||||
|
case strings.Contains(key, "laser wavelength"):
|
||||||
|
if f, ok := firstFloat(val); ok {
|
||||||
|
dev.SFPWavelengthNM = &f
|
||||||
|
changed = true
|
||||||
|
}
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
dev.SFPTemperatureC = &f
|
dev.SFPTemperatureC = &f
|
||||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
|||||||
return changed
|
return changed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||||
|
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||||
|
func parseSFPIdentifier(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||||
|
// e.g. "0x07 (LC)" → "LC".
|
||||||
|
func parseSFPConnector(val string) string {
|
||||||
|
if s := extractParens(val); s != "" {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
return val
|
||||||
|
}
|
||||||
|
|
||||||
|
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||||
|
|
||||||
|
func extractParens(s string) string {
|
||||||
|
m := parenRe.FindStringSubmatch(s)
|
||||||
|
if len(m) < 2 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(m[1])
|
||||||
|
}
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
return map[string]any{}
|
return map[string]any{}
|
||||||
}
|
}
|
||||||
out := map[string]any{}
|
out := map[string]any{}
|
||||||
|
if dev.SFPPresent != nil {
|
||||||
|
out["sfp_present"] = *dev.SFPPresent
|
||||||
|
}
|
||||||
|
if dev.SFPIdentifier != nil {
|
||||||
|
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||||
|
}
|
||||||
|
if dev.SFPConnector != nil {
|
||||||
|
out["sfp_connector"] = *dev.SFPConnector
|
||||||
|
}
|
||||||
|
if dev.SFPVendor != nil {
|
||||||
|
out["sfp_vendor"] = *dev.SFPVendor
|
||||||
|
}
|
||||||
|
if dev.SFPPartNumber != nil {
|
||||||
|
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||||
|
}
|
||||||
|
if dev.SFPSerialNumber != nil {
|
||||||
|
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||||
|
}
|
||||||
|
if dev.SFPWavelengthNM != nil {
|
||||||
|
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||||
|
}
|
||||||
if dev.SFPTemperatureC != nil {
|
if dev.SFPTemperatureC != nil {
|
||||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
|||||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) {
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||||
t.Fatal("ethtool -m should not be called without carrier")
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
class := "EthernetController"
|
class := "EthernetController"
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
|
|||||||
@@ -13,7 +13,9 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
|
Name string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
@@ -72,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if v := strings.TrimSpace(info.Name); v != "" {
|
||||||
|
devs[i].Model = &v
|
||||||
|
}
|
||||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||||
devs[i].SerialNumber = &v
|
devs[i].SerialNumber = &v
|
||||||
}
|
}
|
||||||
@@ -98,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -122,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 13 {
|
if len(rec) < 14 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -132,18 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
|
Index: parseRequiredInt(rec[0]),
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Name: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
Serial: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
VBIOS: strings.TrimSpace(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
TemperatureC: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
PowerW: parseMaybeFloat(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
HWSlowdown: parseMaybeBool(rec[9]),
|
||||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -187,6 +194,14 @@ func parseMaybeInt(v string) *int {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func parseRequiredInt(v string) int {
|
||||||
|
n, err := strconv.Atoi(strings.TrimSpace(v))
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
func pcieLinkGenLabel(gen int) string {
|
func pcieLinkGenLabel(gen int) string {
|
||||||
return fmt.Sprintf("Gen%d", gen)
|
return fmt.Sprintf("Gen%d", gen)
|
||||||
}
|
}
|
||||||
@@ -240,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.TemperatureC = info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("gpu by normalized bdf not found")
|
t.Fatalf("gpu by normalized bdf not found")
|
||||||
}
|
}
|
||||||
|
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||||
|
t.Fatalf("name: got %q", gpu.Name)
|
||||||
|
}
|
||||||
if gpu.Serial != "GPU-SERIAL-1" {
|
if gpu.Serial != "GPU-SERIAL-1" {
|
||||||
t.Fatalf("serial: got %q", gpu.Serial)
|
t.Fatalf("serial: got %q", gpu.Serial)
|
||||||
}
|
}
|
||||||
@@ -86,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
|
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||||
|
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||||
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strconv"
|
"strconv"
|
||||||
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||||
|
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||||
|
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||||
|
bmcPatterns := []string{
|
||||||
|
"management system chip",
|
||||||
|
"management controller",
|
||||||
|
"ibmc",
|
||||||
|
"idrac",
|
||||||
|
"ilo vga",
|
||||||
|
"aspeed",
|
||||||
|
"matrox",
|
||||||
|
}
|
||||||
|
for _, bad := range bmcPatterns {
|
||||||
|
if strings.Contains(d, bad) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||||
internalAMDPatterns := []string{
|
internalAMDPatterns := []string{
|
||||||
"dummy function",
|
"dummy function",
|
||||||
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
|
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||||
|
// speed is below the maximum negotiated speed supported by both ends.
|
||||||
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
warn := statusWarning
|
||||||
|
dev.Status = &warn
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
|
||||||
|
// Returns 0 for unrecognised values so comparisons fail safe.
|
||||||
|
func pcieLinkSpeedRank(gen string) int {
|
||||||
|
switch gen {
|
||||||
|
case "Gen1":
|
||||||
|
return 1
|
||||||
|
case "Gen2":
|
||||||
|
return 2
|
||||||
|
case "Gen3":
|
||||||
|
return 3
|
||||||
|
case "Gen4":
|
||||||
|
return 4
|
||||||
|
case "Gen5":
|
||||||
|
return 5
|
||||||
|
case "Gen6":
|
||||||
|
return 6
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func normalizePCILinkSpeed(raw string) string {
|
func normalizePCILinkSpeed(raw string) string {
|
||||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||||
switch {
|
switch {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
|||||||
{name: "raid", class: "RAID bus controller", want: true},
|
{name: "raid", class: "RAID bus controller", want: true},
|
||||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||||
{name: "vga", class: "VGA compatible controller", want: true},
|
{name: "vga", class: "VGA compatible controller", want: true},
|
||||||
|
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||||
|
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||||
|
ptr := func(s string) *string { return &s }
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
linkSpeed *string
|
||||||
|
maxSpeed *string
|
||||||
|
wantWarning bool
|
||||||
|
wantGenIn string // substring expected in ErrorDescription when warning
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "degraded Gen1 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "at max Gen5",
|
||||||
|
linkSpeed: ptr("Gen5"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "degraded Gen4 vs Gen5",
|
||||||
|
linkSpeed: ptr("Gen4"),
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: true,
|
||||||
|
wantGenIn: "Gen4",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing current speed — no warning",
|
||||||
|
linkSpeed: nil,
|
||||||
|
maxSpeed: ptr("Gen5"),
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing max speed — no warning",
|
||||||
|
linkSpeed: ptr("Gen1"),
|
||||||
|
maxSpeed: nil,
|
||||||
|
wantWarning: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
ok := statusOK
|
||||||
|
dev.Status = &ok
|
||||||
|
dev.LinkSpeed = tt.linkSpeed
|
||||||
|
dev.MaxLinkSpeed = tt.maxSpeed
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||||
|
if gotWarn != tt.wantWarning {
|
||||||
|
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||||
|
}
|
||||||
|
if tt.wantWarning {
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("expected ErrorDescription to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||||
|
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if dev.ErrorDescription != nil {
|
||||||
|
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -160,11 +160,57 @@ type psuSDR struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotPatterns = []*regexp.Regexp{
|
var psuSlotPatterns = []*regexp.Regexp{
|
||||||
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`),
|
// MSI/underscore style: PSU1_POWER_IN, PSU2_POWER_OUT — underscore is \w so \b
|
||||||
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`),
|
// does not fire after the digit; match explicitly with underscore terminator.
|
||||||
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu([0-9]+)_`),
|
||||||
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b`), // PSU1, PS1, ps 2
|
||||||
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`),
|
regexp.MustCompile(`(?i)\bps\s*([0-9]+)\b`), // PS 6, PS6
|
||||||
|
regexp.MustCompile(`(?i)\bpws\s*([0-9]+)\b`), // PWS1
|
||||||
|
regexp.MustCompile(`(?i)\bpower\s*supply(?:\s*bay)?\s*([0-9]+)\b`), // Power Supply 1, Power Supply Bay 3
|
||||||
|
regexp.MustCompile(`(?i)\bbay\s*([0-9]+)\b`), // Bay 1
|
||||||
|
// Fallback for xFusion-style generic numbered PSU sensors (Power1, Power2, …).
|
||||||
|
// Must be last: "power supply N" is already caught by the pattern above.
|
||||||
|
regexp.MustCompile(`(?i)\bpower([0-9]+)\b`),
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuInputPowerKeywords matches AC-input power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_IN, PSU1_PIN
|
||||||
|
// MLT: PSU1_PIN
|
||||||
|
// xFusion: (matched via default fallback — no explicit keyword)
|
||||||
|
// HPE: PS1 Input Power, PS1 Input Watts
|
||||||
|
func isPSUInputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "input power") ||
|
||||||
|
strings.Contains(name, "input watts") ||
|
||||||
|
strings.Contains(name, "_pin") ||
|
||||||
|
strings.Contains(name, " pin") ||
|
||||||
|
strings.Contains(name, "_power_in") ||
|
||||||
|
strings.Contains(name, "power_in")
|
||||||
|
}
|
||||||
|
|
||||||
|
// isPSUOutputPower matches DC-output power sensor names across vendors:
|
||||||
|
// MSI: PSU1_POWER_OUT
|
||||||
|
// MLT: PSU1_POUT
|
||||||
|
// xFusion: PS1 POut
|
||||||
|
func isPSUOutputPower(name string) bool {
|
||||||
|
return strings.Contains(name, "output power") ||
|
||||||
|
strings.Contains(name, "output watts") ||
|
||||||
|
strings.Contains(name, "_pout") ||
|
||||||
|
strings.Contains(name, " pout") ||
|
||||||
|
strings.Contains(name, "_power_out") ||
|
||||||
|
strings.Contains(name, "power_out") ||
|
||||||
|
strings.Contains(name, "power supply bay") ||
|
||||||
|
strings.Contains(name, "psu bay")
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBoundedFloat parses a numeric value from an SDR value field and
|
||||||
|
// validates it is within (0, max]. Returns nil for zero, negative, or
|
||||||
|
// out-of-range values — these indicate missing/off/fault sensor readings.
|
||||||
|
func parseBoundedFloat(raw string, max float64) *float64 {
|
||||||
|
v := parseFloatPtr(raw)
|
||||||
|
if v == nil || *v <= 0 || *v > max {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
func parsePSUSDR(raw string) map[int]psuSDR {
|
func parsePSUSDR(raw string) map[int]psuSDR {
|
||||||
@@ -194,24 +240,59 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
lowerName := strings.ToLower(name)
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(lowerName, "input power"):
|
case isPSUInputPower(lowerName):
|
||||||
entry.inputPowerW = parseFloatPtr(value)
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "output power"):
|
case isPSUOutputPower(lowerName):
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
entry.outputPowerW = parseBoundedFloat(value, 6000)
|
||||||
case strings.Contains(lowerName, "power supply bay"), strings.Contains(lowerName, "psu bay"):
|
|
||||||
entry.outputPowerW = parseFloatPtr(value)
|
|
||||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
entry.inputVoltage = parseFloatPtr(value)
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "temp"):
|
case strings.Contains(lowerName, "temp"):
|
||||||
entry.temperatureC = parseFloatPtr(value)
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
entry.healthPct = parsePercentPtr(value)
|
entry.healthPct = parsePercentPtr(value)
|
||||||
|
default:
|
||||||
|
// Generic PSU power reading: sensor matched a slot pattern but carries
|
||||||
|
// no input/output keyword (e.g. xFusion "Power1", "Power2"). Treat as
|
||||||
|
// AC input if the value looks like wattage and no better data is set yet.
|
||||||
|
if entry.inputPowerW == nil {
|
||||||
|
entry.inputPowerW = parseBoundedFloat(value, 6000)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
out[slot] = entry
|
out[slot] = entry
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PSUSlotPower holds SDR power readings for one PSU slot.
|
||||||
|
// Slot key used by PSUSlotsFromSDR is the 0-based index string,
|
||||||
|
// matching HardwarePowerSupply.Slot in the audit schema.
|
||||||
|
type PSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"`
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"`
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUSlotsFromSDR parses `ipmitool sdr` output and returns per-slot PSU data
|
||||||
|
// using the same battle-tested slot patterns as the hardware audit collector.
|
||||||
|
// Works across MSI (PSU1_POWER_IN), xFusion (Power1, PS1 POut), MLT (PSU1_PIN).
|
||||||
|
// Slot keys are 0-based index strings matching HardwarePowerSupply.Slot.
|
||||||
|
func PSUSlotsFromSDR(sdrOutput string) map[string]PSUSlotPower {
|
||||||
|
sdr := parsePSUSDR(sdrOutput)
|
||||||
|
if len(sdr) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make(map[string]PSUSlotPower, len(sdr))
|
||||||
|
for slot, entry := range sdr {
|
||||||
|
key := strconv.Itoa(slot - 1) // audit uses 0-based slot
|
||||||
|
out[key] = PSUSlotPower{
|
||||||
|
InputW: entry.inputPowerW,
|
||||||
|
OutputW: entry.outputPowerW,
|
||||||
|
Status: entry.status,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
func synthesizePSUsFromSDR(sdr map[int]psuSDR) []schema.HardwarePowerSupply {
|
||||||
if len(sdr) == 0 {
|
if len(sdr) == 0 {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -49,6 +49,10 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
{name: "PWS1 Status", want: 1},
|
{name: "PWS1 Status", want: 1},
|
||||||
{name: "Power Supply Bay 8", want: 8},
|
{name: "Power Supply Bay 8", want: 8},
|
||||||
{name: "PS 6 Input Power", want: 6},
|
{name: "PS 6 Input Power", want: 6},
|
||||||
|
// MSI underscore format — \b does not fire between digit and '_'
|
||||||
|
{name: "PSU1_POWER_IN", want: 1},
|
||||||
|
{name: "PSU2_POWER_OUT", want: 2},
|
||||||
|
{name: "PSU4_STATUS", want: 4},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
@@ -59,6 +63,31 @@ func TestParsePSUSlotVendorVariants(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParsePSUSDRMSIFormat(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
raw := `
|
||||||
|
PSU1_STATUS | F1h | ok
|
||||||
|
PSU1_POWER_OUT | 928 Watts | ok
|
||||||
|
PSU1_POWER_IN | 976 Watts | ok
|
||||||
|
PSU2_STATUS | F2h | ok
|
||||||
|
PSU2_POWER_OUT | 944 Watts | ok
|
||||||
|
PSU2_POWER_IN | 992 Watts | ok
|
||||||
|
`
|
||||||
|
got := parsePSUSDR(raw)
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 976 {
|
||||||
|
t.Fatalf("psu1 input power=%v want 976", got[1].inputPowerW)
|
||||||
|
}
|
||||||
|
if got[1].outputPowerW == nil || *got[1].outputPowerW != 928 {
|
||||||
|
t.Fatalf("psu1 output power=%v want 928", got[1].outputPowerW)
|
||||||
|
}
|
||||||
|
if got[2].inputPowerW == nil || *got[2].inputPowerW != 992 {
|
||||||
|
t.Fatalf("psu2 input power=%v want 992", got[2].inputPowerW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
func TestSynthesizePSUsFromSDR(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
4878
audit/internal/platform/benchmark.go
Normal file
4878
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
735
audit/internal/platform/benchmark_power_autotune.go
Normal file
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
benchmarkPowerAutotuneVersion = 1
|
||||||
|
benchmarkPowerAutotuneIdleSec = 60
|
||||||
|
benchmarkPowerAutotuneLoadSec = 90
|
||||||
|
benchmarkPowerAutotuneSampleInterval = 3
|
||||||
|
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResetBenchmarkPowerAutotuneConfig(path string) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResolveSystemPowerDecision decides which server power source readings
// should come from. A previously saved autotune config (under exportDir)
// wins; without one, both sources are probed once and a temporary fallback
// is chosen, preferring SDR PSU input over DCMI when it responds.
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
	cfg, err := LoadSystemPowerSourceConfig(exportDir)
	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
		// A persisted autotune decision exists; normalize it so a stale or
		// hand-edited value still maps to a known source.
		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
		return SystemPowerSourceDecision{
			Configured:      true,
			SelectedSource:  selected,
			EffectiveSource: selected,
			Mode:            "autotuned",
			Reason:          strings.TrimSpace(cfg.Reason),
			ConfiguredAt:    cfg.UpdatedAt,
		}
	}

	// No config: probe the sources once; use SDR PSU input if it yielded a
	// positive reading, otherwise fall back to DCMI unconditionally.
	sources := sampleBenchmarkPowerSources()
	if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
		return SystemPowerSourceDecision{
			Configured:      false,
			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
			Mode:            "fallback",
			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
		}
	}
	return SystemPowerSourceDecision{
		Configured:      false,
		EffectiveSource: BenchmarkPowerSourceDCMI,
		Mode:            "fallback",
		Reason:          "autotune config not found; using temporary fallback source dcmi",
	}
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectSelectedPowerSourceSamples runs a background sampler against the
// given power source for durationSec seconds (or until ctx is cancelled)
// and returns the watt readings it collected.
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
	if durationSec <= 0 {
		return nil
	}
	stopCh := make(chan struct{})
	doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
	// Wait for either cancellation or the full sampling window to elapse.
	select {
	case <-ctx.Done():
	case <-time.After(time.Duration(durationSec) * time.Second):
	}
	// Stop the sampler and collect its results. doneCh has a 1-slot buffer
	// (see startSelectedPowerSourceSampler), so the goroutine cannot leak.
	close(stopCh)
	return <-doneCh
}
|
||||||
|
|
||||||
|
// startSelectedPowerSourceSampler launches a goroutine that samples the given
// power source every intervalSec seconds until stopCh is closed, then sends
// the accumulated positive readings on the returned channel and closes it.
// A non-positive interval falls back to the autotune default.
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	// Buffer of 1 lets the goroutine deliver its result and exit even if the
	// receiver is not yet blocked on the channel.
	ch := make(chan []float64, 1)
	go func() {
		defer close(ch)
		var samples []float64
		record := func() {
			// Keep only positive readings; query errors and zero watts are
			// silently dropped.
			if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
				samples = append(samples, w)
			}
		}
		// Take one sample immediately so short windows still yield data.
		record()
		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				record()
			}
		}
	}()
	return ch
}
|
||||||
|
|
||||||
|
// benchmarkPowerAutotuneSample is one telemetry snapshot taken during an
// autotune phase ("idle" or "load").
type benchmarkPowerAutotuneSample struct {
	ElapsedSec     float64            // seconds since the phase started
	GPUAvgUsagePct float64            // utilization averaged across sampled GPUs
	CPUUsagePct    float64            // host CPU load percentage
	GPUSumPowerW   float64            // sum of per-GPU power draw, watts
	Sources        map[string]float64 // server power source name -> watts (positive readings only)
}
|
||||||
|
|
||||||
|
// collectBenchmarkPowerAutotuneSamples gathers telemetry snapshots (CPU load,
// server power sources, and GPU utilization/power for gpuIndices) roughly
// every benchmarkPowerAutotuneSampleInterval seconds for durationSec seconds,
// logging each sample via logFunc. It returns whatever was collected so far
// when ctx is cancelled.
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
	if durationSec <= 0 {
		return nil
	}
	var out []benchmarkPowerAutotuneSample
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return out
		}
		row := benchmarkPowerAutotuneSample{
			ElapsedSec:  time.Since(start).Seconds(),
			CPUUsagePct: sampleCPULoadPct(),
			Sources:     sampleBenchmarkPowerSources(),
		}
		// GPU metrics are best-effort: a failed query leaves the GPU fields
		// at zero for this row.
		if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
			var usageSum float64
			for _, gpu := range gpuRows {
				row.GPUSumPowerW += gpu.PowerW
				usageSum += gpu.UsagePct
			}
			row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
		}
		out = append(out, row)
		logBenchmarkPowerAutotuneSample(phase, row, logFunc)
		// Check the deadline after recording so at least one sample is taken
		// even for very short windows.
		if time.Now().After(deadline) {
			return out
		}
		select {
		case <-ctx.Done():
			return out
		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
		}
	}
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateBenchmarkPowerAutotuneIdle checks that the machine was genuinely
// idle during the idle sampling window. Thresholds: GPU avg <= 5%, GPU p95
// <= 10%, CPU avg <= 20%, CPU p95 <= 35%. On failure, Reason names the first
// threshold exceeded; Valid is true only when all thresholds pass.
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
	result := &BenchmarkPowerAutotuneValidation{}
	if len(samples) == 0 {
		result.Reason = "no idle telemetry samples collected"
		return result
	}
	var gpuUsage []float64
	var cpuUsage []float64
	for _, sample := range samples {
		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
		// Zero CPU readings are treated as "no data" and excluded from the
		// CPU statistics (GPU readings are always counted).
		if sample.CPUUsagePct > 0 {
			cpuUsage = append(cpuUsage, sample.CPUUsagePct)
		}
	}
	result.GPUSamples = len(gpuUsage)
	result.CPUSamples = len(cpuUsage)
	// All reported percentages are rounded to one decimal place; thresholds
	// below are applied to the rounded values.
	result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
	result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
	result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
	result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
	switch {
	case result.GPUAvgUsagePct > 5:
		result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
	case result.GPUP95UsagePct > 10:
		result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
	case result.CPUAvgUsagePct > 20:
		result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
	case result.CPUP95UsagePct > 35:
		result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
	default:
		result.Valid = true
	}
	return result
}
|
||||||
|
|
||||||
|
// chooseBenchmarkPowerAutotuneSource picks the server power source whose
// idle→load power delta best matches the GPU-reported power delta. It
// returns the winning source name, all candidates (with Selected and
// SelectionNotes filled in on the winner), the idle and load GPU power
// averages, and an error when no source produced usable samples.
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
	idleBySource := map[string][]float64{}
	loadBySource := map[string][]float64{}
	var idleGPU []float64
	var loadGPU []float64
	// Bucket positive per-source readings and collect GPU power for each phase.
	for _, sample := range idle {
		idleGPU = append(idleGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				idleBySource[source] = append(idleBySource[source], value)
			}
		}
	}
	for _, sample := range load {
		loadGPU = append(loadGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				loadBySource[source] = append(loadBySource[source], value)
			}
		}
	}
	idleGPUAvg := benchmarkMean(idleGPU)
	loadGPUAvg := benchmarkMean(loadGPU)
	gpuDelta := loadGPUAvg - idleGPUAvg
	// A non-positive delta (e.g. missing idle data) falls back to the raw
	// load average as the reference.
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}

	candidates := []BenchmarkPowerAutotuneCandidate{
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
	}
	// Only candidates with samples in both phases and a positive delta are
	// eligible.
	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
	for _, candidate := range candidates {
		if candidate.Available && candidate.DeltaW > 0 {
			available = append(available, candidate)
		}
	}
	if len(available) == 0 {
		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
	}
	// Rank by relative error (lower is better); within a 0.10 tie band the
	// SDR PSU input is preferred, and exact error ties break on sample count.
	sort.Slice(available, func(i, j int) bool {
		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
			if available[i].Source != available[j].Source {
				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
			}
		}
		if available[i].RelativeError != available[j].RelativeError {
			return available[i].RelativeError < available[j].RelativeError
		}
		return available[i].Samples > available[j].Samples
	})
	selected := available[0]
	// Mark the winner in the original candidates slice so callers see the
	// selection alongside the full comparison.
	for idx := range candidates {
		if candidates[idx].Source == selected.Source {
			candidates[idx].Selected = true
			candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
		}
	}
	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderBenchmarkPowerAutotuneSummary renders the autotune result as flat
// key=value lines (one per line) for summary.txt. Optional sections —
// selected source, idle validation, per-candidate metrics — are emitted only
// when present.
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "status=%s\n", result.Status)
	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
	}
	if result.IdleValidation != nil {
		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
		}
	}
	// One availability line per candidate; numeric metrics only for the
	// candidates that actually produced samples.
	for _, candidate := range result.Candidates {
		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
		if candidate.Available {
			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
		}
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// renderBenchmarkPowerAutotuneReport renders the autotune result as a
// Markdown report (report.md): a header block, an optional idle-validation
// section, a candidate comparison table, and trailing notes.
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Source Autotune\n\n")
	fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
	fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
	fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
	fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
	fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
	}
	b.WriteString("\n")
	if result.IdleValidation != nil {
		b.WriteString("## Idle Validation\n\n")
		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
		}
		b.WriteString("\n")
	}
	if len(result.Candidates) > 0 {
		b.WriteString("## Candidates\n\n")
		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, candidate := range result.Candidates {
			// Unavailable candidates get an em-dash row instead of numbers.
			if !candidate.Available {
				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
				continue
			}
			selected := "no"
			if candidate.Selected {
				selected = "yes"
			}
			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
		}
		b.WriteString("\n")
	}
	// Free-form notes accumulated during the run, one bullet each.
	for _, note := range result.Notes {
		fmt.Fprintf(&b, "- %s\n", note)
	}
	return b.String()
}
|
||||||
|
|
||||||
|
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
||||||
|
allDevices := joinIndexList(gpuIndices)
|
||||||
|
switch strings.TrimSpace(strings.ToLower(kind)) {
|
||||||
|
case "power-fit", "power", "nvidia-bench-power":
|
||||||
|
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
||||||
|
if err == nil {
|
||||||
|
return cmd, "power-fit"
|
||||||
|
}
|
||||||
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
||||||
|
default:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", fmt.Sprintf("%d", durationSec),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
if sizeMB > 0 {
|
||||||
|
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
||||||
|
}
|
||||||
|
return cmd, "performance"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNvidiaPowerSourceAutotune runs the full power-source autotune pipeline:
// idle validation, a full-load stage with concurrent telemetry sampling,
// source selection, and persistence of the chosen source plus run artifacts
// (result.json, summary.txt, report.md) under a timestamped directory inside
// baseDir. It returns that run directory (when created) and an error if any
// stage failed; artifacts are still written on most failure paths.
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	// Substitute a no-op logger so later calls never need a nil check.
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	// Each run gets its own timestamped directory for artifacts.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	// Start pessimistic: Status stays FAILED until the run completes.
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	// ── Stage 1: idle window — verify the machine is actually idle ────────
	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		// Artifacts are written even on failure so operators can inspect why.
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	// ── Stage 2: full load, with telemetry sampled concurrently ───────────
	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	// Best-effort: keep the raw load-command output regardless of outcome.
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	// ── Stage 3: compare sources against the GPU power delta ──────────────
	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	// Pull the winner's confidence/notes for the persisted config.
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}
	// ── Stage 4: persist the decision and the run artifacts ───────────────
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
|
||||||
|
|
||||||
|
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
||||||
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshal autotune result: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune result.json: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune summary.txt: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
||||||
|
return fmt.Errorf("write autotune report.md: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// minInt returns the smaller of two ints.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||||||
|
|
||||||
|
// Reference a name from os/exec via the blank identifier so the import stays
// in use even when no other code in this file calls into the package.
var _ = exec.ErrNotFound
|
||||||
558
audit/internal/platform/benchmark_report.go
Normal file
558
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// renderBenchmarkReport renders the Markdown benchmark report for result;
// it is a thin alias for renderBenchmarkReportWithCharts.
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
	return renderBenchmarkReportWithCharts(result)
}
|
||||||
|
|
||||||
|
// renderBenchmarkReportWithCharts builds the complete markdown benchmark
// report: header/identity block, executive summary, a five-perspective
// balanced scorecard, per-GPU details, interconnect, server power, PSU,
// cooling, scalability, and the raw-file index. The output is plain markdown
// intended to be readable as text too (tables are padded by fmtMDTable).
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
	var b strings.Builder

	// ── Header ────────────────────────────────────────────────────────────────
	b.WriteString("# Bee NVIDIA Benchmark Report\n\n")

	// System identity block
	if result.ServerModel != "" {
		fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
	}
	if result.Hostname != "" {
		fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
	}
	// GPU models summary
	if len(result.GPUs) > 0 {
		// Count GPUs per model while preserving first-seen order.
		modelCount := make(map[string]int)
		var modelOrder []string
		for _, g := range result.GPUs {
			m := strings.TrimSpace(g.Name)
			if m == "" {
				m = "Unknown GPU"
			}
			if modelCount[m] == 0 {
				modelOrder = append(modelOrder, m)
			}
			modelCount[m]++
		}
		var parts []string
		for _, m := range modelOrder {
			if modelCount[m] == 1 {
				parts = append(parts, m)
			} else {
				parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
			}
		}
		fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
	}
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	if result.RampStep > 0 && result.RampTotal > 0 {
		fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
		if result.RampRunID != "" {
			fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
		}
	} else if result.ParallelGPUs {
		fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
	}
	if result.ScalabilityScore > 0 {
		fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
	}
	if result.PlatformPowerScore > 0 {
		fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n", result.PlatformPowerScore)
	}
	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
	b.WriteString("\n")

	// ── Executive Summary ─────────────────────────────────────────────────────
	if len(result.Findings) > 0 {
		b.WriteString("## Executive Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}

	if len(result.Warnings) > 0 {
		b.WriteString("## Warnings\n\n")
		for _, warning := range result.Warnings {
			fmt.Fprintf(&b, "- %s\n", warning)
		}
		b.WriteString("\n")
	}

	// ── Balanced Scorecard ────────────────────────────────────────────────────
	b.WriteString("## Balanced Scorecard\n\n")

	// Perspective 1: Compatibility — hard stops
	b.WriteString("### 1. Compatibility\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			thermalThrottle := "-"
			if gpu.Scores.ThermalThrottlePct > 0 {
				thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
			}
			// Fan duty at throttle is only meaningful when the GPU actually
			// thermal-throttled and fan telemetry was captured.
			fanAtThrottle := "-"
			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
				fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
			}
			ecc := "-"
			if gpu.ECC.Uncorrected > 0 {
				ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
			}
			// Hard stop: uncorrectable ECC errors, or thermal throttling while
			// the fans still had headroom (p95 duty below 95%).
			compatStatus := "✓ OK"
			if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
				compatStatus = "⛔ HARD STOP"
			}
			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 2: Thermal headroom
	b.WriteString("### 2. Thermal Headroom\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			// Fall back to conservative defaults when NVML did not report limits.
			shutdownTemp := gpu.ShutdownTempC
			if shutdownTemp <= 0 {
				shutdownTemp = 90
			}
			slowdownTemp := gpu.SlowdownTempC
			if slowdownTemp <= 0 {
				slowdownTemp = 80
			}
			headroom := gpu.Scores.TempHeadroomC
			thermalStatus := "✓ OK"
			switch {
			case headroom < 10:
				thermalStatus = "⛔ CRITICAL"
			case gpu.Steady.P95TempC >= slowdownTemp:
				thermalStatus = "⚠ WARNING"
			}
			throttlePct := "-"
			if gpu.Scores.ThermalThrottlePct > 0 {
				throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
				fmt.Sprintf("%.0f°C", slowdownTemp),
				fmt.Sprintf("%.0f°C", shutdownTemp),
				fmt.Sprintf("%.1f°C", headroom),
				throttlePct,
				thermalStatus,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 3: Power delivery
	b.WriteString("### 3. Power Delivery\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			powerCap := "-"
			if gpu.Scores.PowerCapThrottlePct > 0 {
				powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
			}
			fanDuty := "-"
			if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
				fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
			}
			// More than 5% of the window spent power-cap throttled is flagged.
			powerStatus := "✓ OK"
			if gpu.Scores.PowerCapThrottlePct > 5 {
				powerStatus = "⚠ POWER LIMITED"
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				powerCap,
				fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
				fanDuty,
				powerStatus,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
		b.WriteString("\n")
	}

	// Perspective 4: Performance
	b.WriteString("### 4. Performance\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			// Zero scores render as "-" (metric unavailable for this run).
			synthetic := "-"
			if gpu.Scores.SyntheticScore > 0 {
				synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
			}
			mixed := "-"
			if gpu.Scores.MixedScore > 0 {
				mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
			}
			mixedEff := "-"
			if gpu.Scores.MixedEfficiency > 0 {
				mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
			}
			topsPerSM := "-"
			if gpu.Scores.TOPSPerSMPerGHz > 0 {
				topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
			}
			rows = append(rows, []string{
				fmt.Sprintf("GPU %d", gpu.Index),
				fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
				synthetic, mixed, mixedEff, topsPerSM,
			})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
		if len(result.PerformanceRampSteps) > 0 {
			fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
		}
		b.WriteString("\n")
	}

	// Perspective 5: Anomaly flags
	b.WriteString("### 5. Anomalies\n\n")
	{
		var rows [][]string
		for _, gpu := range result.GPUs {
			eccCorr := "-"
			if gpu.ECC.Corrected > 0 {
				eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
			}
			syncBoost := "-"
			if gpu.Scores.SyncBoostThrottlePct > 0 {
				syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
			}
			// Sustain scores below 70 mark the telemetry trace as unstable.
			powerVar := "OK"
			if gpu.Scores.PowerSustainScore < 70 {
				powerVar = "⚠ unstable"
			}
			thermalVar := "OK"
			if gpu.Scores.ThermalSustainScore < 70 {
				thermalVar = "⚠ unstable"
			}
			rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
		}
		b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
		b.WriteString("\n")
	}

	// ── Per GPU detail ────────────────────────────────────────────────────────
	b.WriteString("## Per-GPU Details\n\n")
	for _, gpu := range result.GPUs {
		name := strings.TrimSpace(gpu.Name)
		if name == "" {
			name = "Unknown GPU"
		}
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)

		// Identity
		if gpu.BusID != "" {
			fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
		}
		if gpu.VBIOS != "" {
			fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
		}
		if gpu.ComputeCapability != "" {
			fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
		}
		if gpu.MultiprocessorCount > 0 {
			fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
		}
		if gpu.PowerLimitW > 0 {
			fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
		}
		if gpu.PowerLimitDerated {
			fmt.Fprintf(&b, "- **Power limit derating:** active (reduced limit %.0f W)\n", gpu.PowerLimitW)
		}
		if gpu.CalibratedPeakPowerW > 0 {
			if gpu.CalibratedPeakTempC > 0 {
				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
			} else {
				fmt.Fprintf(&b, "- **Calibrated peak power:** %.0f W p95\n", gpu.CalibratedPeakPowerW)
			}
		}
		if gpu.LockedGraphicsClockMHz > 0 {
			fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
		}
		b.WriteString("\n")

		// Steady-state telemetry
		if benchmarkTelemetryAvailable(gpu.Steady) {
			fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
			b.WriteString(fmtMDTable(
				[]string{"", "Avg", "P95"},
				[][]string{
					{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
					{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
					{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
					{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
					{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
				},
			))
			b.WriteString("\n")
		} else {
			b.WriteString("**Steady-state telemetry:** unavailable\n\n")
		}

		// Per-precision stability phases.
		if len(gpu.PrecisionSteady) > 0 {
			b.WriteString("**Per-precision stability:**\n\n")
			var precRows [][]string
			for _, p := range gpu.PrecisionSteady {
				eccCorr := "—"
				eccUncorr := "—"
				if !p.ECC.IsZero() {
					eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
					eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
				}
				status := p.Status
				if strings.TrimSpace(status) == "" {
					status = "OK"
				}
				precRows = append(precRows, []string{
					p.Precision, status,
					fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
					fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
					eccCorr, eccUncorr,
				})
			}
			b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
			b.WriteString("\n")
		} else {
			// Legacy: show combined-window variance.
			fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
				gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
		}

		// ECC summary
		if !gpu.ECC.IsZero() {
			fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
				gpu.ECC.Corrected, gpu.ECC.Uncorrected)
		}

		// Throttle
		throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
		if throttle != "none" {
			fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
		}

		// Precision results
		if len(gpu.PrecisionResults) > 0 {
			b.WriteString("**Precision results:**\n\n")
			var presRows [][]string
			for _, p := range gpu.PrecisionResults {
				if p.Supported {
					presRows = append(presRows, []string{
						p.Name,
						fmt.Sprintf("%.2f", p.TeraOpsPerSec),
						fmt.Sprintf("×%.3g", p.Weight),
						fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
						fmt.Sprintf("%d", p.Lanes),
						fmt.Sprintf("%d", p.Iterations),
					})
				} else {
					presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
				}
			}
			b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
			b.WriteString("\n")
		}

		// Degradation / Notes
		if len(gpu.DegradationReasons) > 0 {
			fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
		}
		if len(gpu.Notes) > 0 {
			b.WriteString("**Notes:**\n\n")
			for _, note := range gpu.Notes {
				fmt.Fprintf(&b, "- %s\n", note)
			}
			b.WriteString("\n")
		}
	}

	// ── Interconnect ──────────────────────────────────────────────────────────
	if result.Interconnect != nil {
		b.WriteString("## Interconnect (NCCL)\n\n")
		fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
		if result.Interconnect.Supported {
			b.WriteString(fmtMDTable(
				[]string{"Metric", "Avg", "Max"},
				[][]string{
					{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
					{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
				},
			))
			b.WriteString("\n")
		}
		for _, note := range result.Interconnect.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(result.Interconnect.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── Server Power ───────────────────────────────────────────────────────────
	if sp := result.ServerPower; sp != nil {
		title := "## Server Power\n\n"
		if sp.Source != "" {
			title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
		}
		b.WriteString(title)
		if !sp.Available {
			b.WriteString("Server power measurement unavailable.\n\n")
		} else {
			spRows := [][]string{
				{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
				{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
				{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
				{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
			}
			if sp.ReportingRatio > 0 {
				spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
			}
			b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
			b.WriteString("\n")
		}
		for _, note := range sp.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(sp.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── PSU Issues ────────────────────────────────────────────────────────────
	if len(result.PSUIssues) > 0 {
		b.WriteString("## PSU Issues\n\n")
		b.WriteString("The following power supply anomalies were detected during the benchmark:\n\n")
		for _, issue := range result.PSUIssues {
			fmt.Fprintf(&b, "- ⛔ %s\n", issue)
		}
		b.WriteString("\n")
	}

	// ── Cooling ───────────────────────────────────────────────────────────────
	if cooling := result.Cooling; cooling != nil {
		b.WriteString("## Cooling\n\n")
		if cooling.Available {
			dutyAvg, dutyP95 := "N/A", "N/A"
			if cooling.FanDutyCycleAvailable {
				dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
				dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
			}
			b.WriteString(fmtMDTable(
				[]string{"Metric", "Value"},
				[][]string{
					{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
					{"Average fan duty cycle", dutyAvg},
					{"P95 fan duty cycle", dutyP95},
				},
			))
			b.WriteString("\n")
		} else {
			b.WriteString("Cooling telemetry unavailable.\n\n")
		}
		for _, note := range cooling.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		if len(cooling.Notes) > 0 {
			b.WriteString("\n")
		}
	}

	// ── Platform Scalability ──────────────────────────────────────────────────
	if len(result.PerformanceRampSteps) > 0 {
		b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
		fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
		var scalRows [][]string
		for _, step := range result.PerformanceRampSteps {
			scalRows = append(scalRows, []string{
				fmt.Sprintf("%d", step.StepIndex),
				joinIndexList(step.GPUIndices),
				fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
				fmt.Sprintf("%.1f%%", step.ScalabilityPct),
			})
		}
		b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
		b.WriteString("\n")
	}

	// ── Raw files ─────────────────────────────────────────────────────────────
	b.WriteString("## Raw Files\n\n")
	b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
	b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
	if result.Interconnect != nil {
		b.WriteString("- `nccl-all-reduce.log`\n")
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||||
|
type counter struct {
|
||||||
|
label string
|
||||||
|
us uint64
|
||||||
|
}
|
||||||
|
counters := []counter{
|
||||||
|
{"sw_power", t.SWPowerCapUS},
|
||||||
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||||
|
{"sync_boost", t.SyncBoostUS},
|
||||||
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||||
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, c := range counters {
|
||||||
|
if c.us == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sec := float64(c.us) / 1e6
|
||||||
|
if steadyDurationSec > 0 {
|
||||||
|
pct := sec / steadyDurationSec * 100
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||||
|
} else if sec < 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(parts) == 0 {
|
||||||
|
return "none"
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||||
|
var best float64
|
||||||
|
for i, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||||
|
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||||
|
best = gpu.Scores.CompositeScore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||||
|
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fmtMDTable renders a markdown table whose columns are padded to a uniform
// width, so the table stays readable as plain text without a markdown renderer.
//
// headers supplies the column titles and fixes the column count; each row is
// read positionally, with missing trailing cells treated as empty strings.
// An empty header list yields an empty string.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellAt returns the i-th cell of row, or "" when the row is short.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}

	// Column width = widest of the header and every cell in that column.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = len(h)
	}
	for _, row := range rows {
		for i := range widths {
			if n := len(cellAt(row, i)); n > widths[i] {
				widths[i] = n
			}
		}
	}

	var out strings.Builder

	// writeRow emits one padded "| a | b |" line using the cell accessor.
	writeRow := func(cells func(int) string) {
		out.WriteByte('|')
		for i := 0; i < ncols; i++ {
			c := cells(i)
			out.WriteByte(' ')
			out.WriteString(c)
			out.WriteString(strings.Repeat(" ", widths[i]-len(c)))
			out.WriteString(" |")
		}
		out.WriteByte('\n')
	}

	// Header row.
	writeRow(func(i int) string { return headers[i] })

	// Separator row.
	out.WriteByte('|')
	for i := 0; i < ncols; i++ {
		out.WriteString(strings.Repeat("-", widths[i]+2))
		out.WriteByte('|')
	}
	out.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		row := row // capture per iteration for the closure
		writeRow(func(i int) string { return cellAt(row, i) })
	}

	return out.String()
}
|
||||||
582
audit/internal/platform/benchmark_test.go
Normal file
582
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,582 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestResolveBenchmarkProfile verifies that each named profile resolves to the
// expected phase durations, and that an empty profile name falls back to the
// standard profile.
func TestResolveBenchmarkProfile(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name    string
		profile string
		want    benchmarkProfileSpec
	}{
		{
			name:    "default",
			profile: "",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
		},
		{
			name:    "stability",
			profile: "stability",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
		},
		{
			name:    "overnight",
			profile: "overnight",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
		},
	}

	for _, tc := range cases {
		tc := tc // capture for the subtest closure (pre-Go 1.22 loop semantics)
		t.Run(tc.name, func(t *testing.T) {
			got := resolveBenchmarkProfile(tc.profile)
			if got != tc.want {
				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
			}
		})
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanStandard checks the standard-profile steady plan:
// four 60s precision phases followed by a 300s mixed phase (480s total budget),
// with the mixed phase planned last.
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
	t.Parallel()

	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if len(labels) != 5 || len(phases) != 5 {
		t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
	}
	if basePhaseSec != 60 {
		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
	}
	if mixedPhaseSec != 300 {
		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
	}
	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
	}
	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanStability checks the stability-profile steady
// plan: four 300s precision phases followed by a 3600s mixed phase.
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if basePhaseSec != 300 {
		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
	}
	if mixedPhaseSec != 3600 {
		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestBuildBenchmarkSteadyPlanOvernight checks the overnight-profile steady
// plan: four 3600s precision phases followed by a 14400s mixed phase.
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
		benchmarkPrecisionPhases,
		func(label string) string { return label }, // identity stage-name mapper
	)
	if basePhaseSec != 3600 {
		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
	}
	if mixedPhaseSec != 14400 {
		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}
|
||||||
|
|
||||||
|
// TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations verifies that metric
// rows are bucketed by the cumulative planned-phase boundaries (10s, 20s, 70s
// here), keyed by each phase's PlanLabel. Rows past the last boundary land in
// the final phase's bucket.
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
	t.Parallel()

	phases := []benchmarkPlannedPhase{
		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
	}
	rows := []GPUMetricRow{
		{ElapsedSec: 5},  // fp8 window
		{ElapsedSec: 15}, // fp16 window
		{ElapsedSec: 25}, // mixed window
		{ElapsedSec: 65}, // mixed window
	}
	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
	if len(got["fp8"]) != 1 {
		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
	}
	if len(got["fp16"]) != 1 {
		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
	}
	if len(got["mixed"]) != 2 {
		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
	}
}
|
||||||
|
|
||||||
|
// TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell pins the supported
// precision list for compute capabilities 9.0 and 10.0.
// NOTE(review): despite the test name, fp4 is expected absent for 10.0 as
// well — confirm whether that exclusion is intentional for that capability.
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
	t.Parallel()

	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
		t.Fatalf("supported=%v", got)
	}
	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
		t.Fatalf("supported=%v", got)
	}
}
|
||||||
|
|
||||||
|
// TestBenchmarkPlannedPhaseStatus verifies the mapping from raw phase-log
// content to a status string: plain success, a phase error, and a phase error
// that is classified as unsupported when cublasLt profiles are unavailable.
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name       string
		raw        string
		wantStatus string
	}{
		{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
		{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
		{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
	}
	for _, tc := range cases {
		tc := tc // capture for the subtest closure (pre-Go 1.22 loop semantics)
		t.Run(tc.name, func(t *testing.T) {
			got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
			if got != tc.wantStatus {
				t.Fatalf("status=%q want %q", got, tc.wantStatus)
			}
		})
	}
}
|
||||||
|
|
||||||
|
func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
before := BenchmarkThrottleCounters{}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWPowerCapUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("sw_power_cap should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWPowerBrakeSlowdownUS: 1_000_000}); got != "" {
|
||||||
|
t.Fatalf("hw_power_brake should be ignored, got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{HWThermalSlowdownUS: 1_000_000}); got != "hw_thermal" {
|
||||||
|
t.Fatalf("hw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
if got := benchmarkCalibrationThrottleReason(before, BenchmarkThrottleCounters{SWThermalSlowdownUS: 1_000_000}); got != "sw_thermal" {
|
||||||
|
t.Fatalf("sw_thermal mismatch: got %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 1000 }
|
||||||
|
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||||
|
t.Fatal("unexpected reset call")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{0, 2}, func(line string) {
|
||||||
|
logs = append(logs, line)
|
||||||
|
})
|
||||||
|
if got, want := strings.Join(logs, "\n"), "power benchmark pre-flight: root privileges unavailable, GPU reset skipped"; !strings.Contains(got, want) {
|
||||||
|
t.Fatalf("logs=%q want substring %q", got, want)
|
||||||
|
}
|
||||||
|
if len(failed) != 2 || failed[0] != 0 || failed[1] != 2 {
|
||||||
|
t.Fatalf("failed=%v want [0 2]", failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
var calls []int
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
calls = append(calls, index)
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if len(failed) != 0 {
|
||||||
|
t.Fatalf("failed=%v want no failures", failed)
|
||||||
|
}
|
||||||
|
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||||
|
t.Fatalf("calls=%v want %s", calls, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||||
|
oldGeteuid := benchmarkGeteuid
|
||||||
|
oldSleep := benchmarkSleep
|
||||||
|
oldReset := benchmarkResetNvidiaGPU
|
||||||
|
benchmarkGeteuid = func() int { return 0 }
|
||||||
|
benchmarkSleep = func(time.Duration) {}
|
||||||
|
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||||
|
if index == 5 {
|
||||||
|
return "busy\n", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
return "ok\n", nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() {
|
||||||
|
benchmarkGeteuid = oldGeteuid
|
||||||
|
benchmarkSleep = oldSleep
|
||||||
|
benchmarkResetNvidiaGPU = oldReset
|
||||||
|
})
|
||||||
|
|
||||||
|
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||||
|
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||||
|
t.Fatalf("failed=%v want %s", failed, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||||
|
Profile: "stability",
|
||||||
|
RunNCCL: false,
|
||||||
|
})
|
||||||
|
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||||
|
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||||
|
}
|
||||||
|
if opts.RunNCCL {
|
||||||
|
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInitialBenchmarkCalibrationLimitW(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
info benchmarkGPUInfo
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "prefers default tdp over current derated limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 600,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 600,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "caps default tdp to reported max limit",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 500,
|
||||||
|
DefaultPowerLimitW: 700,
|
||||||
|
MaxPowerLimitW: 650,
|
||||||
|
},
|
||||||
|
want: 650,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to current limit when default missing",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
PowerLimitW: 525,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
want: 525,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "falls back to max limit when only that is known",
|
||||||
|
info: benchmarkGPUInfo{
|
||||||
|
MaxPowerLimitW: 575,
|
||||||
|
},
|
||||||
|
want: 575,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if got := initialBenchmarkCalibrationLimitW(tc.info); got != tc.want {
|
||||||
|
t.Fatalf("initialBenchmarkCalibrationLimitW(%+v)=%d want %d", tc.info, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := strings.Join([]string{
|
||||||
|
"loader=bee-gpu-burn",
|
||||||
|
"[gpu 0] device=NVIDIA H100",
|
||||||
|
"[gpu 0] compute_capability=9.0",
|
||||||
|
"[gpu 0] backend=cublasLt",
|
||||||
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||||
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] int8_tensor_iterations=80",
|
||||||
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
|
"[gpu 0] status=OK",
|
||||||
|
}, "\n")
|
||||||
|
|
||||||
|
got := parseBenchmarkBurnLog(raw)
|
||||||
|
if got.Backend != "cublasLt" {
|
||||||
|
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||||
|
}
|
||||||
|
if got.ComputeCapability != "9.0" {
|
||||||
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
|
}
|
||||||
|
if len(got.Profiles) != 3 {
|
||||||
|
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||||
|
}
|
||||||
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
|
}
|
||||||
|
if got.Profiles[0].Category != "fp16_bf16" {
|
||||||
|
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[1].Category != "fp8" {
|
||||||
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Category != "int8" {
|
||||||
|
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Weight != 0.25 {
|
||||||
|
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
result := NvidiaBenchmarkResult{
|
||||||
|
BenchmarkVersion: benchmarkVersion,
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "PARTIAL",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "partial",
|
||||||
|
},
|
||||||
|
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||||
|
GPUs: []BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100",
|
||||||
|
Status: "OK",
|
||||||
|
Steady: BenchmarkTelemetrySummary{
|
||||||
|
AvgPowerW: 680,
|
||||||
|
AvgTempC: 79,
|
||||||
|
AvgGraphicsClockMHz: 1725,
|
||||||
|
P95PowerW: 700,
|
||||||
|
P95TempC: 82,
|
||||||
|
P95GraphicsClockMHz: 1800,
|
||||||
|
},
|
||||||
|
Scores: BenchmarkScorecard{
|
||||||
|
ComputeScore: 1200,
|
||||||
|
PowerSustainScore: 96,
|
||||||
|
ThermalSustainScore: 88,
|
||||||
|
StabilityScore: 92,
|
||||||
|
CompositeScore: 1176,
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||||
|
},
|
||||||
|
Throttle: BenchmarkThrottleCounters{
|
||||||
|
SWPowerCapUS: 1000000,
|
||||||
|
},
|
||||||
|
DegradationReasons: []string{"power_capped"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Cooling: &BenchmarkCoolingSummary{
|
||||||
|
Available: true,
|
||||||
|
AvgFanRPM: 9200,
|
||||||
|
FanDutyCycleAvailable: true,
|
||||||
|
AvgFanDutyCyclePct: 47.5,
|
||||||
|
P95FanDutyCyclePct: 62.0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(result)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Executive Summary",
|
||||||
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
|
"1176.00",
|
||||||
|
"fp16_tensor",
|
||||||
|
"700.00",
|
||||||
|
"Cooling",
|
||||||
|
"Average fan duty cycle",
|
||||||
|
"47.5%",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "OK",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "full",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, needle := range []string{
|
||||||
|
"gpu-metrics.csv",
|
||||||
|
"gpu-metrics.html",
|
||||||
|
"gpu-burn.log",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
|
||||||
|
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
|
||||||
|
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
|
||||||
|
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
|
||||||
|
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
|
||||||
|
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
if score.SyntheticScore != 100 {
|
||||||
|
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
|
||||||
|
}
|
||||||
|
if score.MixedScore != 50 {
|
||||||
|
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Min Power Limit : 200.00 W
|
||||||
|
Max Power Limit : 600.00 W
|
||||||
|
Default Power Limit : 575.00 W
|
||||||
|
Current Power Limit : 560.00 W
|
||||||
|
Clocks
|
||||||
|
Graphics : 2422 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
SM : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Video : 2107 MHz
|
||||||
|
|
||||||
|
GPU 00000000:4F:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
`)
|
||||||
|
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||||
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxPowerLimitW != 600 {
|
||||||
|
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].DefaultPowerLimitW != 575 {
|
||||||
|
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].PowerLimitW != 560 {
|
||||||
|
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Min Power Limit : 100.00 W
|
||||||
|
Max Power Limit : 900.00 W
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 9999 MHz
|
||||||
|
Memory : 9999 MHz
|
||||||
|
`)
|
||||||
|
// Already populated — must not be overwritten.
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {
|
||||||
|
Index: 0,
|
||||||
|
BusID: "00000000:4E:00.0",
|
||||||
|
MaxGraphicsClockMHz: 2430,
|
||||||
|
MaxMemoryClockMHz: 12481,
|
||||||
|
MinPowerLimitW: 200,
|
||||||
|
MaxPowerLimitW: 600,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MinPowerLimitW != 200 {
|
||||||
|
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
|
||||||
|
}
|
||||||
|
}
|
||||||
536
audit/internal/platform/benchmark_types.go
Normal file
536
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,536 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// BenchmarkHostConfig holds static CPU and memory configuration captured at
|
||||||
|
// benchmark start. Useful for correlating results across runs on different hardware.
|
||||||
|
type BenchmarkHostConfig struct {
|
||||||
|
CPUModel string `json:"cpu_model,omitempty"`
|
||||||
|
CPUSockets int `json:"cpu_sockets,omitempty"`
|
||||||
|
CPUCores int `json:"cpu_cores,omitempty"`
|
||||||
|
CPUThreads int `json:"cpu_threads,omitempty"`
|
||||||
|
MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
|
||||||
|
// steady-state phase. High or unstable CPU load during a GPU benchmark may
|
||||||
|
// indicate a competing workload or a CPU-bound driver bottleneck.
|
||||||
|
type BenchmarkCPULoad struct {
|
||||||
|
AvgPct float64 `json:"avg_pct"`
|
||||||
|
MaxPct float64 `json:"max_pct"`
|
||||||
|
P95Pct float64 `json:"p95_pct"`
|
||||||
|
Samples int `json:"samples"`
|
||||||
|
// Status is "ok", "high", or "unstable".
|
||||||
|
Status string `json:"status"`
|
||||||
|
Note string `json:"note,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
|
||||||
|
// benchmark run.
|
||||||
|
type BenchmarkCoolingSummary struct {
|
||||||
|
Available bool `json:"available"`
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
NvidiaBenchmarkProfileStandard = "standard"
|
||||||
|
NvidiaBenchmarkProfileStability = "stability"
|
||||||
|
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
|
||||||
|
BenchmarkPowerEngineTargetedPower = "targeted_power"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
|
||||||
|
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
|
||||||
|
// re-measure from actual task logs and update the constants here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - BenchmarkEstimatedPerfStandardSec: MLT v8.22 ramp 1-4: 927 s; xFusion v8.22 parallel 8GPU: 1080 s
|
||||||
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
|
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||||
|
const (
|
||||||
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
|
// Sequential per-GPU mode scales approximately linearly.
|
||||||
|
BenchmarkEstimatedPerfStandardSec = 960 // ~16 min; ramp-up 1-4: 927 s, parallel 8GPU: 1080 s
|
||||||
|
BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
|
||||||
|
BenchmarkEstimatedPerfOvernightSec = 8 * 3600
|
||||||
|
|
||||||
|
// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
|
||||||
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
|
)
|
||||||
|
|
||||||
|
type NvidiaBenchmarkOptions struct {
|
||||||
|
Profile string
|
||||||
|
SizeMB int
|
||||||
|
GPUIndices []int
|
||||||
|
ExcludeGPUIndices []int
|
||||||
|
RunNCCL bool
|
||||||
|
ServerPowerSource string
|
||||||
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
|
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||||
|
RampTotal int // total number of ramp-up steps in this run
|
||||||
|
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
BenchmarkPowerSourceDCMI = "dcmi"
|
||||||
|
BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
|
||||||
|
)
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneConfig struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
UpdatedAt time.Time `json:"updated_at"`
|
||||||
|
SelectedSource string `json:"selected_source"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec,omitempty"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec,omitempty"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type SystemPowerSourceDecision struct {
|
||||||
|
Configured bool `json:"configured"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
EffectiveSource string `json:"effective_source,omitempty"`
|
||||||
|
Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
ConfiguredAt time.Time `json:"configured_at,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneResult struct {
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkKind string `json:"benchmark_kind,omitempty"`
|
||||||
|
Profile string `json:"profile,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
IdleDurationSec int `json:"idle_duration_sec"`
|
||||||
|
LoadDurationSec int `json:"load_duration_sec"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec"`
|
||||||
|
SelectedSource string `json:"selected_source,omitempty"`
|
||||||
|
IdleValidationError string `json:"idle_validation_error,omitempty"`
|
||||||
|
IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
|
||||||
|
GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"`
|
||||||
|
GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"`
|
||||||
|
Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneValidation struct {
|
||||||
|
Valid bool `json:"valid"`
|
||||||
|
GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
|
||||||
|
GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
|
||||||
|
CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
|
||||||
|
CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
|
||||||
|
GPUSamples int `json:"gpu_samples,omitempty"`
|
||||||
|
CPUSamples int `json:"cpu_samples,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkPowerAutotuneCandidate struct {
|
||||||
|
Source string `json:"source"`
|
||||||
|
IdleAvgW float64 `json:"idle_avg_w,omitempty"`
|
||||||
|
LoadAvgW float64 `json:"load_avg_w,omitempty"`
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"`
|
||||||
|
Samples int `json:"samples,omitempty"`
|
||||||
|
RelativeError float64 `json:"relative_error,omitempty"`
|
||||||
|
Confidence float64 `json:"confidence,omitempty"`
|
||||||
|
Selected bool `json:"selected,omitempty"`
|
||||||
|
Available bool `json:"available"`
|
||||||
|
SelectionNotes string `json:"selection_notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaBenchmarkResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
RampStep int `json:"ramp_step,omitempty"`
|
||||||
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
|
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||||
|
// PlatformPowerScore is the mean compute scalability across ramp steps 2..N.
|
||||||
|
// 100% = each added GPU contributes exactly its single-card throughput.
|
||||||
|
// < 100% = throughput loss due to thermal throttle, power limits, or contention.
|
||||||
|
PlatformPowerScore float64 `json:"platform_power_score,omitempty"`
|
||||||
|
PerformanceRampSteps []NvidiaPerformanceRampStep `json:"performance_ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
|
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||||
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkNormalization struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkNormalizationGPU struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
PersistenceMode string `json:"persistence_mode,omitempty"`
|
||||||
|
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
|
||||||
|
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
|
||||||
|
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
|
||||||
|
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkGPUResult struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
UUID string `json:"uuid,omitempty"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
VBIOS string `json:"vbios,omitempty"`
|
||||||
|
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||||
|
Backend string `json:"backend,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
|
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
|
||||||
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// ShutdownTempC is the hardware thermal shutdown threshold for this GPU,
|
||||||
|
// sourced from nvidia-smi -q ("GPU Shutdown Temp"). Fallback: 90°C.
|
||||||
|
ShutdownTempC float64 `json:"shutdown_temp_c,omitempty"`
|
||||||
|
// SlowdownTempC is the software throttle onset threshold ("GPU Slowdown Temp").
|
||||||
|
// Fallback: 80°C.
|
||||||
|
SlowdownTempC float64 `json:"slowdown_temp_c,omitempty"`
|
||||||
|
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||||
|
// dcgmi targeted_power calibration run before the main benchmark.
|
||||||
|
// Used as the reference denominator for PowerSustainScore instead of
|
||||||
|
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||||
|
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||||
|
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
|
||||||
|
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
|
||||||
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
|
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||||
|
PrecisionFailures []string `json:"precision_failures,omitempty"`
|
||||||
|
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||||
|
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||||
|
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||||
|
Scores BenchmarkScorecard `json:"scores"`
|
||||||
|
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning is non-empty when a thermal throttle event occurred with
|
||||||
|
// a clock drop ≥20% while server fans were not at 100% duty cycle.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkTelemetrySummary struct {
|
||||||
|
DurationSec float64 `json:"duration_sec"`
|
||||||
|
Samples int `json:"samples"`
|
||||||
|
AvgTempC float64 `json:"avg_temp_c"`
|
||||||
|
P95TempC float64 `json:"p95_temp_c"`
|
||||||
|
AvgPowerW float64 `json:"avg_power_w"`
|
||||||
|
P95PowerW float64 `json:"p95_power_w"`
|
||||||
|
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
|
||||||
|
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
|
||||||
|
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
|
||||||
|
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
|
||||||
|
AvgUsagePct float64 `json:"avg_usage_pct"`
|
||||||
|
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
|
||||||
|
ClockCVPct float64 `json:"clock_cv_pct"`
|
||||||
|
PowerCVPct float64 `json:"power_cv_pct"`
|
||||||
|
TempCVPct float64 `json:"temp_cv_pct"`
|
||||||
|
ClockDriftPct float64 `json:"clock_drift_pct"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkThrottleCounters struct {
|
||||||
|
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
|
||||||
|
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
|
||||||
|
SyncBoostUS uint64 `json:"sync_boost_us"`
|
||||||
|
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
|
||||||
|
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
|
||||||
|
// Corrected = single-bit errors fixed by ECC (DRAM degradation).
|
||||||
|
// Uncorrected = double-bit errors that could not be corrected (serious fault).
|
||||||
|
// Both are volatile (since last driver reset), not persistent.
|
||||||
|
type BenchmarkECCCounters struct {
|
||||||
|
Corrected uint64 `json:"corrected"`
|
||||||
|
Uncorrected uint64 `json:"uncorrected"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
|
||||||
|
func (e BenchmarkECCCounters) IsZero() bool { return e.Corrected == 0 && e.Uncorrected == 0 }
|
||||||
|
|
||||||
|
type BenchmarkPrecisionResult struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Category string `json:"category"`
|
||||||
|
Supported bool `json:"supported"`
|
||||||
|
Lanes int `json:"lanes,omitempty"`
|
||||||
|
M uint64 `json:"m,omitempty"`
|
||||||
|
N uint64 `json:"n,omitempty"`
|
||||||
|
K uint64 `json:"k,omitempty"`
|
||||||
|
Iterations uint64 `json:"iterations,omitempty"`
|
||||||
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
|
// Weight is the fp32-equivalence factor for this precision category.
|
||||||
|
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||||
|
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||||
|
Weight float64 `json:"weight,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkScorecard struct {
|
||||||
|
ComputeScore float64 `json:"compute_score"`
|
||||||
|
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||||
|
// steady phases (each precision ran alone, full GPU dedicated).
|
||||||
|
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||||
|
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||||
|
// (all precisions competing simultaneously — closer to real workloads).
|
||||||
|
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||||
|
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||||
|
// sustains throughput under concurrent mixed-precision load.
|
||||||
|
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||||
|
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||||
|
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||||
|
// StabilityScore: fraction of steady-state time the GPU spent throttling
|
||||||
|
// (thermal + power cap combined). 0% throttle = 100; 100% throttle = 0.
|
||||||
|
StabilityScore float64 `json:"stability_score"`
|
||||||
|
|
||||||
|
// Throttle breakdown — percentage of steady-state time in each throttle type.
|
||||||
|
// Used for diagnosis: tells WHY the GPU throttled, not just whether it did.
|
||||||
|
ThermalThrottlePct float64 `json:"thermal_throttle_pct"` // HW+SW thermal slowdown
|
||||||
|
PowerCapThrottlePct float64 `json:"power_cap_throttle_pct"` // SW power cap
|
||||||
|
SyncBoostThrottlePct float64 `json:"sync_boost_throttle_pct,omitempty"`
|
||||||
|
|
||||||
|
// Temperature headroom: distance to the 100°C destruction threshold.
|
||||||
|
// TempHeadroomC = 100 - P95TempC. < 20°C = warning; < 10°C = critical.
|
||||||
|
// Independent of throttle — a GPU at 86°C without throttle is still in the red zone.
|
||||||
|
TempHeadroomC float64 `json:"temp_headroom_c"`
|
||||||
|
|
||||||
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
|
// ServerQualityScore (0–100) reflects server infrastructure quality independent
|
||||||
|
// of GPU model. Combines throttle time, power variance, and temp variance.
|
||||||
|
// Use this to compare servers with the same GPU, or to flag a bad server
|
||||||
|
// that throttles an otherwise fast GPU.
|
||||||
|
ServerQualityScore float64 `json:"server_quality_score"`
|
||||||
|
// CompositeScore is the raw compute score (TOPS, fp32-equivalent).
|
||||||
|
// A throttling GPU will score lower here automatically — no quality multiplier.
|
||||||
|
CompositeScore float64 `json:"composite_score"`
|
||||||
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkPSUSlotPower holds SDR power readings for one PSU slot sampled
|
||||||
|
// during the benchmark. Slot keys match audit HardwarePowerSupply.Slot (0-based)
|
||||||
|
// so benchmark and audit data can be correlated by slot.
|
||||||
|
type BenchmarkPSUSlotPower struct {
|
||||||
|
InputW *float64 `json:"input_w,omitempty"` // AC wall input (PSUx_POWER_IN)
|
||||||
|
OutputW *float64 `json:"output_w,omitempty"` // DC output (PSUx_POWER_OUT)
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower captures server-side power from multiple independent
|
||||||
|
// sources: IPMI DCMI (high-level), IPMI SDR per-PSU sensors (granular), and
|
||||||
|
// GPU-reported power (nvidia-smi). Cross-comparing sources detects when DCMI
|
||||||
|
// covers only a subset of installed PSUs (partial coverage).
|
||||||
|
//
|
||||||
|
// Source legend:
|
||||||
|
// - DCMI — `ipmitool dcmi power reading`; fast but may miss PSUs
|
||||||
|
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
|
||||||
|
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
|
||||||
|
type BenchmarkServerPower struct {
|
||||||
|
Available bool `json:"available"`
|
||||||
|
Source string `json:"source,omitempty"`
|
||||||
|
Mode string `json:"mode,omitempty"`
|
||||||
|
Reason string `json:"reason,omitempty"`
|
||||||
|
SampleIntervalSec int `json:"sample_interval_sec,omitempty"`
|
||||||
|
IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle
|
||||||
|
LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle
|
||||||
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
|
|
||||||
|
// PSU AC input sum — sampled at idle and at peak load using collector's
|
||||||
|
// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
|
||||||
|
PSUInputIdleW float64 `json:"psu_input_idle_w,omitempty"`
|
||||||
|
PSUInputLoadedW float64 `json:"psu_input_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// PSU DC output sum — power delivered to server internals after conversion.
|
||||||
|
PSUOutputIdleW float64 `json:"psu_output_idle_w,omitempty"`
|
||||||
|
PSUOutputLoadedW float64 `json:"psu_output_loaded_w,omitempty"`
|
||||||
|
|
||||||
|
// Per-slot PSU readings at idle and at peak load.
|
||||||
|
// Keys are 0-based slot strings matching audit HardwarePowerSupply.Slot.
|
||||||
|
PSUSlotReadingsIdle map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_idle,omitempty"`
|
||||||
|
PSUSlotReadingsLoaded map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings_loaded,omitempty"`
|
||||||
|
|
||||||
|
// GPUSlotTotalW is the sum of GPU_POWER_SLOTx SDR sensors at peak load.
|
||||||
|
// PCIe slot delivery only (excludes 16-pin connector power).
|
||||||
|
GPUSlotTotalW float64 `json:"gpu_slot_total_w,omitempty"`
|
||||||
|
|
||||||
|
// DCMICoverageRatio = DCMI_idle / SDR_PSU_IN_idle.
|
||||||
|
// Near 1.0 → DCMI tracks all PSUs. Near 0.5 → DCMI tracks half the PSUs.
|
||||||
|
DCMICoverageRatio float64 `json:"dcmi_coverage_ratio,omitempty"`
|
||||||
|
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||||
|
// during a dedicated single-precision steady window. Because only one kernel
|
||||||
|
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||||
|
type BenchmarkPrecisionSteadyPhase struct {
|
||||||
|
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
// ECC errors accumulated during this precision phase only.
|
||||||
|
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||||
|
// Any uncorrected = serious fault triggered by this precision workload.
|
||||||
|
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||||
|
Notes string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkInterconnectResult struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Attempted bool `json:"attempted"`
|
||||||
|
Supported bool `json:"supported"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
|
||||||
|
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
|
||||||
|
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
|
||||||
|
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
|
||||||
|
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
|
||||||
|
// cumulative thermal ramp. Represents the actual sustained power budget of
|
||||||
|
// this server under full GPU load. Use for rack power planning.
|
||||||
|
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
|
||||||
|
// ServerPower captures IPMI server power delta (idle→loaded) measured in
|
||||||
|
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
|
||||||
|
// actual wall-power draw as seen by the server's power supply.
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
// PSUIssues holds power supply fault events detected by comparing IPMI PSU
|
||||||
|
// sensor states before and after the power benchmark run. Empty when IPMI is
|
||||||
|
// unavailable or no PSU faults occurred during the test.
|
||||||
|
PSUIssues []string `json:"psu_issues,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchGPU struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
// AppliedPowerLimitW is the stable limit found during single-card calibration.
|
||||||
|
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
|
||||||
|
// StablePowerLimitW is the final fixed limit for this GPU after the
|
||||||
|
// cumulative thermal ramp. This is the limit at which the GPU operated
|
||||||
|
// stably with all other GPUs running simultaneously at their own limits.
|
||||||
|
// May be lower than AppliedPowerLimitW if multi-GPU thermal load required
|
||||||
|
// additional derating.
|
||||||
|
StablePowerLimitW float64 `json:"stable_power_limit_w,omitempty"`
|
||||||
|
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
|
||||||
|
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
|
||||||
|
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
|
||||||
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
|
// Fan state sampled at the end of single-card calibration.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// NewGPUIndex is the GPU whose stable limit was searched in this step.
|
||||||
|
NewGPUIndex int `json:"new_gpu_index"`
|
||||||
|
// NewGPUStableLimitW is the stable power limit found for the new GPU.
|
||||||
|
NewGPUStableLimitW float64 `json:"new_gpu_stable_limit_w,omitempty"`
|
||||||
|
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
|
||||||
|
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
|
||||||
|
Derated bool `json:"derated,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// PSU slot readings sampled at end of this ramp step.
|
||||||
|
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||||
|
// Fan state at end of this ramp step.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||||
|
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
// scalability ramp-up phase of the performance benchmark.
|
||||||
|
type NvidiaPerformanceRampStep struct {
|
||||||
|
StepIndex int `json:"step_index"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
// TotalSyntheticTOPS is the sum of per-GPU SyntheticScore (fp32-equivalent
|
||||||
|
// TOPS from dedicated single-precision phases) across all GPUs in this step.
|
||||||
|
TotalSyntheticTOPS float64 `json:"total_synthetic_tops"`
|
||||||
|
TotalMixedTOPS float64 `json:"total_mixed_tops,omitempty"`
|
||||||
|
// ScalabilityPct = TotalSyntheticTOPS / (k × best_single_gpu_tops) × 100.
|
||||||
|
// 100% = perfect linear scaling. < 100% = thermal/power/interconnect loss.
|
||||||
|
ScalabilityPct float64 `json:"scalability_pct"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
@@ -13,19 +13,27 @@ import (
|
|||||||
|
|
||||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||||
type GPUMetricRow struct {
|
type GPUMetricRow struct {
|
||||||
ElapsedSec float64 `json:"elapsed_sec"`
|
Stage string `json:"stage,omitempty"`
|
||||||
GPUIndex int `json:"index"`
|
StageStartSec float64 `json:"stage_start_sec,omitempty"`
|
||||||
TempC float64 `json:"temp_c"`
|
StageEndSec float64 `json:"stage_end_sec,omitempty"`
|
||||||
UsagePct float64 `json:"usage_pct"`
|
ElapsedSec float64 `json:"elapsed_sec"`
|
||||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
GPUIndex int `json:"index"`
|
||||||
PowerW float64 `json:"power_w"`
|
TempC float64 `json:"temp_c"`
|
||||||
ClockMHz float64 `json:"clock_mhz"`
|
UsagePct float64 `json:"usage_pct"`
|
||||||
|
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
|
ClockMHz float64 `json:"clock_mhz"`
|
||||||
|
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||||
|
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||||
|
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||||
|
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||||
|
FanDutyCycleEstimated bool `json:"fan_duty_cycle_estimated,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -46,7 +54,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
parts := strings.Split(line, ", ")
|
parts := strings.Split(line, ", ")
|
||||||
if len(parts) < 6 {
|
if len(parts) < 7 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
@@ -57,6 +65,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
MemUsagePct: parseGPUFloat(parts[3]),
|
MemUsagePct: parseGPUFloat(parts[3]),
|
||||||
PowerW: parseGPUFloat(parts[4]),
|
PowerW: parseGPUFloat(parts[4]),
|
||||||
ClockMHz: parseGPUFloat(parts[5]),
|
ClockMHz: parseGPUFloat(parts[5]),
|
||||||
|
MemClockMHz: parseGPUFloat(parts[6]),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return rows, nil
|
return rows, nil
|
||||||
@@ -139,14 +148,28 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available,fan_duty_cycle_estimated\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
dutyAvail := 0
|
||||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
if r.FanDutyCycleAvailable {
|
||||||
|
dutyAvail = 1
|
||||||
|
}
|
||||||
|
dutyEstimated := 0
|
||||||
|
if r.FanDutyCycleEstimated {
|
||||||
|
dutyEstimated = 1
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d,%d\n",
|
||||||
|
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail, dutyEstimated)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type gpuMetricStageSpan struct {
|
||||||
|
Name string
|
||||||
|
Start float64
|
||||||
|
End float64
|
||||||
|
}
|
||||||
|
|
||||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||||
// Group by GPU index preserving order.
|
// Group by GPU index preserving order.
|
||||||
@@ -161,9 +184,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
stageSpans := buildGPUMetricStageSpans(rows)
|
||||||
|
stageColorByName := make(map[string]string, len(stageSpans))
|
||||||
|
for i, span := range stageSpans {
|
||||||
|
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||||
|
}
|
||||||
|
|
||||||
|
var legend strings.Builder
|
||||||
|
if len(stageSpans) > 0 {
|
||||||
|
legend.WriteString(`<div class="stage-legend">`)
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||||
|
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
legend.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
var svgs strings.Builder
|
var svgs strings.Builder
|
||||||
for _, gpuIdx := range order {
|
for _, gpuIdx := range order {
|
||||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||||
svgs.WriteString("\n")
|
svgs.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -173,21 +212,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
|||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<title>GPU Stress Test Metrics</title>
|
<title>GPU Stress Test Metrics</title>
|
||||||
<style>
|
<style>
|
||||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
*{box-sizing:border-box}
|
||||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||||
|
.page{padding:24px}
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
h1{font-size:22px;margin:0 0 6px}
|
||||||
|
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||||
|
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||||
|
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||||
|
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||||
|
.chart-block{margin-top:16px}
|
||||||
</style>
|
</style>
|
||||||
</head><body>
|
</head><body>
|
||||||
|
<div class="page">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">GPU Stress Test Metrics</div>
|
||||||
|
<div class="card-body">
|
||||||
<h1>GPU Stress Test Metrics</h1>
|
<h1>GPU Stress Test Metrics</h1>
|
||||||
<p>Generated %s</p>
|
<p>Generated %s</p>
|
||||||
%s
|
%s
|
||||||
</body></html>`, ts, svgs.String())
|
<div class="chart-block">%s</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`, ts, legend.String(), svgs.String())
|
||||||
|
|
||||||
return os.WriteFile(path, []byte(html), 0644)
|
return os.WriteFile(path, []byte(html), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||||
// Layout
|
// Layout
|
||||||
const W, H = 960, 520
|
const W, H = 960, 520
|
||||||
const plotX1 = 120 // usage axis / chart left border
|
const plotX1 = 120 // usage axis / chart left border
|
||||||
@@ -197,7 +254,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
const PW = plotX2 - plotX1
|
const PW = plotX2 - plotX1
|
||||||
const PH = plotY2 - plotY1
|
const PH = plotY2 - plotY1
|
||||||
// Outer axes
|
// Outer axes
|
||||||
const tempAxisX = 60 // temp axis line
|
const tempAxisX = 60 // temp axis line
|
||||||
const clockAxisX = 900 // clock axis line
|
const clockAxisX = 900 // clock axis line
|
||||||
|
|
||||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||||
@@ -282,6 +339,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
}
|
}
|
||||||
b.WriteString("</g>\n")
|
b.WriteString("</g>\n")
|
||||||
|
|
||||||
|
// Stage backgrounds
|
||||||
|
for _, span := range stageSpans {
|
||||||
|
x1 := xv(span.Start)
|
||||||
|
x2 := xv(span.End)
|
||||||
|
if x2 < x1 {
|
||||||
|
x1, x2 = x2, x1
|
||||||
|
}
|
||||||
|
if x2-x1 < 1 {
|
||||||
|
x2 = x1 + 1
|
||||||
|
}
|
||||||
|
color := stageColorByName[span.Name]
|
||||||
|
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||||
|
x1, plotY1, x2-x1, PH, color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||||
|
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||||
|
}
|
||||||
|
|
||||||
// Chart border
|
// Chart border
|
||||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||||
@@ -380,224 +454,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
|
||||||
ansiRed = "\033[31m"
|
|
||||||
ansiBlue = "\033[34m"
|
|
||||||
ansiGreen = "\033[32m"
|
|
||||||
ansiYellow = "\033[33m"
|
|
||||||
ansiReset = "\033[0m"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
termChartWidth = 70
|
|
||||||
termChartHeight = 12
|
|
||||||
)
|
|
||||||
|
|
||||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
|
||||||
// Used in SAT stress-test logs.
|
|
||||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|
||||||
seen := make(map[int]bool)
|
|
||||||
var order []int
|
|
||||||
gpuMap := make(map[int][]GPUMetricRow)
|
|
||||||
for _, r := range rows {
|
|
||||||
if !seen[r.GPUIndex] {
|
|
||||||
seen[r.GPUIndex] = true
|
|
||||||
order = append(order, r.GPUIndex)
|
|
||||||
}
|
|
||||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
|
||||||
}
|
|
||||||
|
|
||||||
type seriesDef struct {
|
|
||||||
caption string
|
|
||||||
color string
|
|
||||||
fn func(GPUMetricRow) float64
|
|
||||||
}
|
|
||||||
defs := []seriesDef{
|
|
||||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
|
||||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
|
||||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
|
||||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for _, gpuIdx := range order {
|
|
||||||
gr := gpuMap[gpuIdx]
|
|
||||||
if len(gr) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
|
||||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
|
||||||
for _, d := range defs {
|
|
||||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
|
||||||
termChartHeight, termChartWidth))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return strings.TrimRight(b.String(), "\n")
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
|
||||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
|
||||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
|
||||||
if len(vals) == 0 {
|
|
||||||
return caption + "\n"
|
|
||||||
}
|
|
||||||
|
|
||||||
mn, mx := gpuMinMax(vals)
|
|
||||||
if mn == mx {
|
|
||||||
mx = mn + 1
|
|
||||||
}
|
|
||||||
|
|
||||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
|
||||||
w := width
|
|
||||||
if len(vals) < w {
|
|
||||||
w = len(vals)
|
|
||||||
}
|
|
||||||
data := gpuDownsample(vals, w)
|
|
||||||
|
|
||||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
|
||||||
row := make([]int, w)
|
|
||||||
for i, v := range data {
|
|
||||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 {
|
|
||||||
r = 0
|
|
||||||
}
|
|
||||||
if r > height {
|
|
||||||
r = height
|
|
||||||
}
|
|
||||||
row[i] = r
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fill the character grid.
|
|
||||||
grid := make([][]rune, height+1)
|
|
||||||
for i := range grid {
|
|
||||||
grid[i] = make([]rune, w)
|
|
||||||
for j := range grid[i] {
|
|
||||||
grid[i][j] = ' '
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for x := 0; x < w; x++ {
|
|
||||||
r := row[x]
|
|
||||||
if x == 0 {
|
|
||||||
grid[r][0] = '─'
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
p := row[x-1]
|
|
||||||
switch {
|
|
||||||
case r == p:
|
|
||||||
grid[r][x] = '─'
|
|
||||||
case r < p: // value went up (row index decreased toward top)
|
|
||||||
grid[r][x] = '╭'
|
|
||||||
grid[p][x] = '╯'
|
|
||||||
for y := r + 1; y < p; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
default: // r > p, value went down
|
|
||||||
grid[p][x] = '╮'
|
|
||||||
grid[r][x] = '╰'
|
|
||||||
for y := p + 1; y < r; y++ {
|
|
||||||
grid[y][x] = '│'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Y axis tick labels.
|
|
||||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
|
||||||
tickAtRow := make(map[int]string)
|
|
||||||
labelWidth := 4
|
|
||||||
for _, t := range ticks {
|
|
||||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
|
||||||
if r < 0 || r > height {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
s := gpuFormatTick(t)
|
|
||||||
tickAtRow[r] = s
|
|
||||||
if len(s) > labelWidth {
|
|
||||||
labelWidth = len(s)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var b strings.Builder
|
|
||||||
for r := 0; r <= height; r++ {
|
|
||||||
label := tickAtRow[r]
|
|
||||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
|
||||||
switch {
|
|
||||||
case label != "":
|
|
||||||
b.WriteRune('┤')
|
|
||||||
case r == height:
|
|
||||||
b.WriteRune('┼')
|
|
||||||
default:
|
|
||||||
b.WriteRune('│')
|
|
||||||
}
|
|
||||||
b.WriteString(color)
|
|
||||||
b.WriteString(string(grid[r]))
|
|
||||||
b.WriteString(ansiReset)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bottom axis.
|
|
||||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
|
||||||
b.WriteRune('└')
|
|
||||||
b.WriteString(strings.Repeat("─", w))
|
|
||||||
b.WriteRune('\n')
|
|
||||||
|
|
||||||
// Caption centered under the chart.
|
|
||||||
if caption != "" {
|
|
||||||
total := labelWidth + 1 + w
|
|
||||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
|
||||||
b.WriteString(strings.Repeat(" ", pad))
|
|
||||||
}
|
|
||||||
b.WriteString(caption)
|
|
||||||
b.WriteRune('\n')
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
|
||||||
v := make([]float64, len(rows))
|
|
||||||
for i, r := range rows {
|
|
||||||
v[i] = fn(r)
|
|
||||||
}
|
|
||||||
return v
|
|
||||||
}
|
|
||||||
|
|
||||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
|
||||||
func gpuDownsample(vals []float64, w int) []float64 {
|
|
||||||
n := len(vals)
|
|
||||||
if n == 0 {
|
|
||||||
return make([]float64, w)
|
|
||||||
}
|
|
||||||
result := make([]float64, w)
|
|
||||||
if n >= w {
|
|
||||||
counts := make([]int, w)
|
|
||||||
for i, v := range vals {
|
|
||||||
bucket := i * w / n
|
|
||||||
if bucket >= w {
|
|
||||||
bucket = w - 1
|
|
||||||
}
|
|
||||||
result[bucket] += v
|
|
||||||
counts[bucket]++
|
|
||||||
}
|
|
||||||
for i := range result {
|
|
||||||
if counts[i] > 0 {
|
|
||||||
result[i] /= float64(counts[i])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Nearest-neighbour upsample.
|
|
||||||
for i := range result {
|
|
||||||
src := i * (n - 1) / (w - 1)
|
|
||||||
if src >= n {
|
|
||||||
src = n - 1
|
|
||||||
}
|
|
||||||
result[i] = vals[src]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
func gpuMinMax(vals []float64) (float64, float64) {
|
func gpuMinMax(vals []float64) (float64, float64) {
|
||||||
if len(vals) == 0 {
|
if len(vals) == 0 {
|
||||||
return 0, 1
|
return 0, 1
|
||||||
@@ -642,3 +498,57 @@ func gpuFormatTick(v float64) string {
|
|||||||
}
|
}
|
||||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var gpuMetricStagePalette = []string{
|
||||||
|
"#d95c5c",
|
||||||
|
"#2185d0",
|
||||||
|
"#21ba45",
|
||||||
|
"#f2c037",
|
||||||
|
"#6435c9",
|
||||||
|
"#00b5ad",
|
||||||
|
"#a5673f",
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||||
|
var spans []gpuMetricStageSpan
|
||||||
|
for _, row := range rows {
|
||||||
|
name := strings.TrimSpace(row.Stage)
|
||||||
|
if name == "" {
|
||||||
|
name = "run"
|
||||||
|
}
|
||||||
|
start := row.StageStartSec
|
||||||
|
end := row.StageEndSec
|
||||||
|
if end <= start {
|
||||||
|
start = row.ElapsedSec
|
||||||
|
end = row.ElapsedSec
|
||||||
|
}
|
||||||
|
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||||
|
spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if start < spans[len(spans)-1].Start {
|
||||||
|
spans[len(spans)-1].Start = start
|
||||||
|
}
|
||||||
|
if end > spans[len(spans)-1].End {
|
||||||
|
spans[len(spans)-1].End = end
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := range spans {
|
||||||
|
if spans[i].End <= spans[i].Start {
|
||||||
|
spans[i].End = spans[i].Start + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return spans
|
||||||
|
}
|
||||||
|
|
||||||
|
var gpuHTMLReplacer = strings.NewReplacer(
|
||||||
|
"&", "&",
|
||||||
|
"<", "<",
|
||||||
|
">", ">",
|
||||||
|
`"`, """,
|
||||||
|
"'", "'",
|
||||||
|
)
|
||||||
|
|
||||||
|
func gpuHTMLEscape(s string) string {
|
||||||
|
return gpuHTMLReplacer.Replace(s)
|
||||||
|
}
|
||||||
|
|||||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage,elapsed_sec,gpu_index",
|
||||||
|
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "gpu-metrics.html")
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||||
|
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||||
|
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||||
|
}
|
||||||
|
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||||
|
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"stage-legend",
|
||||||
|
"baseline",
|
||||||
|
"steady-fp16",
|
||||||
|
"GPU Stress Test Metrics",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(text, needle) {
|
||||||
|
t.Fatalf("html missing %q\n%s", needle, text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,12 +11,11 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
|
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
fsType := mountFSType("/run/live/medium")
|
return s.LiveMediaRAMState().InRAM
|
||||||
if fsType == "" {
|
|
||||||
return toramActive()
|
|
||||||
}
|
|
||||||
return strings.EqualFold(fsType, "tmpfs")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) LiveBootSource() LiveBootSource {
|
func (s *System) LiveBootSource() LiveBootSource {
|
||||||
@@ -48,42 +47,164 @@ func (s *System) LiveBootSource() LiveBootSource {
|
|||||||
return status
|
return status
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
func (s *System) LiveMediaRAMState() LiveMediaRAMState {
|
||||||
|
return evaluateLiveMediaRAMState(
|
||||||
|
s.LiveBootSource(),
|
||||||
|
toramActive(),
|
||||||
|
globPaths("/run/live/medium/live/*.squashfs"),
|
||||||
|
globPaths(filepath.Join(installToRAMDir, "*.squashfs")),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func evaluateLiveMediaRAMState(status LiveBootSource, toram bool, sourceSquashfs, copiedSquashfs []string) LiveMediaRAMState {
|
||||||
|
state := LiveMediaRAMState{
|
||||||
|
LiveBootSource: status,
|
||||||
|
ToramActive: toram,
|
||||||
|
CopyPresent: len(copiedSquashfs) > 0,
|
||||||
|
}
|
||||||
|
if status.InRAM {
|
||||||
|
state.State = "in_ram"
|
||||||
|
state.Status = "ok"
|
||||||
|
state.CopyComplete = true
|
||||||
|
state.Message = "Running from RAM — installation media can be safely disconnected."
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
expected := pathBaseSet(sourceSquashfs)
|
||||||
|
copied := pathBaseSet(copiedSquashfs)
|
||||||
|
state.CopyComplete = len(expected) > 0 && setContainsAll(copied, expected)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case state.CopyComplete:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Live media files were copied to RAM, but the system is still mounted from the original boot source."
|
||||||
|
case state.CopyPresent:
|
||||||
|
state.State = "partial"
|
||||||
|
state.Status = "partial"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "Partial RAM copy detected. A previous Copy to RAM run was interrupted or cancelled."
|
||||||
|
case toram:
|
||||||
|
state.State = "toram_failed"
|
||||||
|
state.Status = "failed"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "toram boot parameter is set but the live medium is not mounted from RAM."
|
||||||
|
default:
|
||||||
|
state.State = "not_in_ram"
|
||||||
|
state.Status = "warning"
|
||||||
|
state.CanStartCopy = true
|
||||||
|
state.Message = "ISO not copied to RAM. Use Copy to RAM to free the boot drive and improve performance."
|
||||||
|
}
|
||||||
|
return state
|
||||||
|
}
|
||||||
|
|
||||||
|
func globPaths(pattern string) []string {
|
||||||
|
matches, _ := filepath.Glob(pattern)
|
||||||
|
return matches
|
||||||
|
}
|
||||||
|
|
||||||
|
func pathBaseSet(paths []string) map[string]struct{} {
|
||||||
|
out := make(map[string]struct{}, len(paths))
|
||||||
|
for _, path := range paths {
|
||||||
|
base := strings.TrimSpace(filepath.Base(path))
|
||||||
|
if base != "" {
|
||||||
|
out[base] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func setContainsAll(have, want map[string]struct{}) bool {
|
||||||
|
if len(want) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for name := range want {
|
||||||
|
if _, ok := have[name]; !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (retErr error) {
|
||||||
log := func(msg string) {
|
log := func(msg string) {
|
||||||
if logFunc != nil {
|
if logFunc != nil {
|
||||||
logFunc(msg)
|
logFunc(msg)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.IsLiveMediaInRAM() {
|
state := s.LiveMediaRAMState()
|
||||||
|
if state.InRAM {
|
||||||
log("Already running from RAM — installation media can be safely disconnected.")
|
log("Already running from RAM — installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||||
if err != nil || len(squashfsFiles) == 0 {
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/")
|
|
||||||
}
|
|
||||||
|
|
||||||
free := freeMemBytes()
|
dstDir := installToRAMDir
|
||||||
var needed int64
|
|
||||||
for _, sf := range squashfsFiles {
|
// If the source medium is unavailable, check whether a previous run already
|
||||||
fi, err2 := os.Stat(sf)
|
// produced a complete copy in RAM. If so, skip the copy phase and proceed
|
||||||
if err2 != nil {
|
// directly to the loop-rebind / bind-mount steps.
|
||||||
return fmt.Errorf("stat %s: %v", sf, err2)
|
if !sourceAvailable {
|
||||||
|
copiedFiles, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(copiedFiles) > 0 {
|
||||||
|
log("Source medium not available, but a previous RAM copy was found — resuming from existing copy.")
|
||||||
|
// Proceed to rebind with the already-copied files.
|
||||||
|
for _, dst := range copiedFiles {
|
||||||
|
base := filepath.Base(dst)
|
||||||
|
// Re-associate the loop device that was originally backed by the
|
||||||
|
// source file (now gone); find it by the old source path pattern.
|
||||||
|
srcGuess := "/run/live/medium/live/" + base
|
||||||
|
loopDev, lerr := findLoopForFile(srcGuess)
|
||||||
|
if lerr != nil {
|
||||||
|
log(fmt.Sprintf("Loop device for %s not found (%v) — skipping re-association.", base, lerr))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if rerr := reassociateLoopDevice(loopDev, dst); rerr != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not re-associate %s → %s: %v", loopDev, dst, rerr))
|
||||||
|
} else {
|
||||||
|
log(fmt.Sprintf("Loop device %s now backed by RAM copy.", loopDev))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto bindMedium
|
||||||
}
|
}
|
||||||
needed += fi.Size()
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||||
}
|
|
||||||
const headroom = 256 * 1024 * 1024
|
|
||||||
if free > 0 && needed+headroom > free {
|
|
||||||
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
|
||||||
humanBytes(needed+headroom), humanBytes(free))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dstDir := "/dev/shm/bee-live"
|
{
|
||||||
|
free := freeMemBytes()
|
||||||
|
var needed int64
|
||||||
|
for _, sf := range squashfsFiles {
|
||||||
|
fi, err2 := os.Stat(sf)
|
||||||
|
if err2 != nil {
|
||||||
|
return fmt.Errorf("stat %s: %v", sf, err2)
|
||||||
|
}
|
||||||
|
needed += fi.Size()
|
||||||
|
}
|
||||||
|
const headroom = 256 * 1024 * 1024
|
||||||
|
if free > 0 && needed+headroom > free {
|
||||||
|
return fmt.Errorf("insufficient RAM: need %s, available %s",
|
||||||
|
humanBytes(needed+headroom), humanBytes(free))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if state.CopyPresent {
|
||||||
|
log("Removing stale partial RAM copy before retry...")
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
||||||
return fmt.Errorf("create tmpfs dir: %v", err)
|
return fmt.Errorf("create tmpfs dir: %v", err)
|
||||||
}
|
}
|
||||||
|
defer func() {
|
||||||
|
if retErr == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(dstDir)
|
||||||
|
log("Removed incomplete RAM copy.")
|
||||||
|
}()
|
||||||
|
|
||||||
for _, sf := range squashfsFiles {
|
for _, sf := range squashfsFiles {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -109,6 +230,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bindMedium:
|
||||||
log("Copying remaining medium files...")
|
log("Copying remaining medium files...")
|
||||||
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
if err := cpDir(ctx, "/run/live/medium", dstDir, log); err != nil {
|
||||||
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
log(fmt.Sprintf("Warning: partial copy: %v", err))
|
||||||
@@ -116,14 +238,71 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
|
||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
mediumRebound := false
|
||||||
|
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||||
|
} else {
|
||||||
|
mediumRebound = true
|
||||||
}
|
}
|
||||||
|
|
||||||
log("Done. Installation media can be safely disconnected.")
|
log("Verifying live medium now served from RAM...")
|
||||||
|
status := s.LiveBootSource()
|
||||||
|
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if status.InRAM {
|
||||||
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
|
}
|
||||||
|
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||||
|
if status.InRAM {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// The live medium mount was not redirected to RAM. This is expected when
|
||||||
|
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||||
|
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||||
|
// because the CD-ROM mount is in use. Check whether files were at least
|
||||||
|
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||||
|
// once the kernel has paged in all actively-used data.
|
||||||
|
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||||
|
if len(files) > 0 {
|
||||||
|
if !mediumRebound {
|
||||||
|
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||||
|
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
|
source := strings.TrimSpace(status.Device)
|
||||||
|
if source == "" {
|
||||||
|
source = strings.TrimSpace(status.Source)
|
||||||
|
}
|
||||||
|
if source == "" {
|
||||||
|
source = "unknown source"
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(status.Kind) {
|
||||||
|
case "ram":
|
||||||
|
return "RAM"
|
||||||
|
case "usb":
|
||||||
|
return "USB (" + source + ")"
|
||||||
|
case "cdrom":
|
||||||
|
return "CD-ROM (" + source + ")"
|
||||||
|
case "disk":
|
||||||
|
return "disk (" + source + ")"
|
||||||
|
default:
|
||||||
|
return source
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
in, err := os.Open(src)
|
in, err := os.Open(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -141,6 +320,7 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
defer out.Close()
|
defer out.Close()
|
||||||
total := fi.Size()
|
total := fi.Size()
|
||||||
var copied int64
|
var copied int64
|
||||||
|
var lastLogged int64
|
||||||
buf := make([]byte, 4*1024*1024)
|
buf := make([]byte, 4*1024*1024)
|
||||||
for {
|
for {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
@@ -152,7 +332,8 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return werr
|
return werr
|
||||||
}
|
}
|
||||||
copied += int64(n)
|
copied += int64(n)
|
||||||
if logFunc != nil && total > 0 {
|
if shouldLogCopyProgress(copied, total, lastLogged) {
|
||||||
|
lastLogged = copied
|
||||||
pct := int(float64(copied) / float64(total) * 100)
|
pct := int(float64(copied) / float64(total) * 100)
|
||||||
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
logFunc(fmt.Sprintf(" %s / %s (%d%%)", humanBytes(copied), humanBytes(total), pct))
|
||||||
}
|
}
|
||||||
@@ -167,6 +348,19 @@ func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) e
|
|||||||
return out.Sync()
|
return out.Sync()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldLogCopyProgress(copied, total, lastLogged int64) bool {
|
||||||
|
if total <= 0 || copied <= 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if copied >= total {
|
||||||
|
return copied > lastLogged
|
||||||
|
}
|
||||||
|
if copied < copyProgressLogStep {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return copied-lastLogged >= copyProgressLogStep
|
||||||
|
}
|
||||||
|
|
||||||
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func cpDir(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
return filepath.Walk(src, func(path string, fi os.FileInfo, err error) error {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
@@ -212,7 +406,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
|||||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loopDeviceOffset returns the byte offset configured for the loop device,
|
||||||
|
// or -1 if it cannot be determined.
|
||||||
|
func loopDeviceOffset(loopDev string) int64 {
|
||||||
|
out, err := exec.Command("losetup", "--json", loopDev).Output()
|
||||||
|
if err != nil {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
var result struct {
|
||||||
|
Loopdevices []struct {
|
||||||
|
Offset int64 `json:"offset"`
|
||||||
|
} `json:"loopdevices"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
return result.Loopdevices[0].Offset
|
||||||
|
}
|
||||||
|
|
||||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||||
|
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||||
|
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||||
|
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||||
|
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||||
|
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||||
|
}
|
||||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
|
||||||
|
func bindMount(src, dst string) error {
|
||||||
|
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||||
|
}
|
||||||
|
|||||||
@@ -7,3 +7,7 @@ import "errors"
|
|||||||
func loopChangeFD(loopDev, newFile string) error {
|
func loopChangeFD(loopDev, newFile string) error {
|
||||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bindMount(src, dst string) error {
|
||||||
|
return errors.New("bind mount not available on this platform")
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
func TestInferLiveBootKind(t *testing.T) {
|
func TestInferLiveBootKind(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
fsType string
|
fsType string
|
||||||
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
{name: "unknown", source: "overlay", want: "unknown"},
|
{name: "unknown", source: "overlay", want: "unknown"},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
|
tc := tc
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||||
if got != tc.want {
|
if got != tc.want {
|
||||||
@@ -26,3 +29,98 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
dstDir := t.TempDir()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||||
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
|
}
|
||||||
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||||
|
t.Fatalf("error=%q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDescribeLiveBootSource(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||||
|
t.Fatalf("got %q want RAM", got)
|
||||||
|
}
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||||
|
t.Fatalf("got %q want /run/live/medium", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEvaluateLiveMediaRAMState(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
t.Run("in_ram", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"},
|
||||||
|
false,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "in_ram" || state.Status != "ok" || state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("partial_copy_after_cancel", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
false,
|
||||||
|
[]string{"/run/live/medium/live/filesystem.squashfs", "/run/live/medium/live/firmware.squashfs"},
|
||||||
|
[]string{"/dev/shm/bee-live/filesystem.squashfs"},
|
||||||
|
)
|
||||||
|
if state.State != "partial" || state.Status != "partial" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
if state.CopyComplete {
|
||||||
|
t.Fatalf("CopyComplete=%v want false", state.CopyComplete)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("toram_failed", func(t *testing.T) {
|
||||||
|
state := evaluateLiveMediaRAMState(
|
||||||
|
LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"},
|
||||||
|
true,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
if state.State != "toram_failed" || state.Status != "failed" || !state.CanStartCopy {
|
||||||
|
t.Fatalf("state=%+v", state)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestShouldLogCopyProgress(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
total := int64(250 * 1024 * 1024)
|
||||||
|
step := int64(100 * 1024 * 1024)
|
||||||
|
|
||||||
|
if shouldLogCopyProgress(step-1, total, 0) {
|
||||||
|
t.Fatal("progress logged too early")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(step, total, 0) {
|
||||||
|
t.Fatal("expected log at first 100MB boundary")
|
||||||
|
}
|
||||||
|
if shouldLogCopyProgress(step+16*1024*1024, total, step) {
|
||||||
|
t.Fatal("progress logged again before next 100MB")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(2*step, total, step) {
|
||||||
|
t.Fatal("expected log at second 100MB boundary")
|
||||||
|
}
|
||||||
|
if !shouldLogCopyProgress(total, total, 2*step) {
|
||||||
|
t.Fatal("expected final completion log")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||||
@@ -15,6 +18,11 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
"nvbandwidth",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
@@ -26,7 +34,12 @@ type KilledProcess struct {
|
|||||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||||
// SIGKILL to each one found. It returns a list of killed processes.
|
// SIGKILL to each one found. It returns a list of killed processes.
|
||||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||||
|
// The scan runs under a 5-second deadline to avoid blocking if the process
|
||||||
|
// table is very large (e.g. after a stress test with thousands of children).
|
||||||
func KillTestWorkers() []KilledProcess {
|
func KillTestWorkers() []KilledProcess {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
entries, err := os.ReadDir("/proc")
|
entries, err := os.ReadDir("/proc")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -34,6 +47,13 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
|
|
||||||
var killed []KilledProcess
|
var killed []KilledProcess
|
||||||
for _, e := range entries {
|
for _, e := range entries {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
slog.Warn("KillTestWorkers scan timed out", "killed_so_far", len(killed))
|
||||||
|
return killed
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
if !e.IsDir() {
|
if !e.IsDir() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -52,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
|||||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||||
base = exe[idx+1:]
|
base = exe[idx+1:]
|
||||||
}
|
}
|
||||||
for _, pat := range workerPatterns {
|
if shouldKillWorkerProcess(exe, base) {
|
||||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return killed
|
return killed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldKillWorkerProcess(exe, base string) bool {
|
||||||
|
for _, pat := range workerPatterns {
|
||||||
|
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|||||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
exe string
|
||||||
|
base string
|
||||||
|
want bool
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nvbandwidth executable",
|
||||||
|
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||||
|
base: "nvbandwidth",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "dcgmi executable",
|
||||||
|
exe: "/usr/bin/dcgmi",
|
||||||
|
base: "dcgmi",
|
||||||
|
want: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unrelated process",
|
||||||
|
exe: "/usr/bin/bash",
|
||||||
|
base: "bash",
|
||||||
|
want: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||||
|
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,8 +1,10 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bufio"
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sort"
|
"sort"
|
||||||
@@ -14,13 +16,24 @@ import (
|
|||||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||||
// collected for the web UI metrics page.
|
// collected for the web UI metrics page.
|
||||||
type LiveMetricSample struct {
|
type LiveMetricSample struct {
|
||||||
Timestamp time.Time `json:"ts"`
|
Timestamp time.Time `json:"ts"`
|
||||||
Fans []FanReading `json:"fans"`
|
Fans []FanReading `json:"fans"`
|
||||||
Temps []TempReading `json:"temps"`
|
Temps []TempReading `json:"temps"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
PowerSource string `json:"power_source,omitempty"`
|
||||||
MemLoadPct float64 `json:"mem_load_pct"`
|
PowerMode string `json:"power_mode,omitempty"`
|
||||||
GPUs []GPUMetricRow `json:"gpus"`
|
PowerReason string `json:"power_reason,omitempty"`
|
||||||
|
PSUs []PSUReading `json:"psus,omitempty"`
|
||||||
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||||
|
MemLoadPct float64 `json:"mem_load_pct"`
|
||||||
|
GPUs []GPUMetricRow `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// PSUReading is a per-slot power supply input power reading.
|
||||||
|
type PSUReading struct {
|
||||||
|
Slot int `json:"slot"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
PowerW float64 `json:"power_w"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// TempReading is a named temperature sensor value.
|
// TempReading is a named temperature sensor value.
|
||||||
@@ -54,8 +67,17 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// System power — returns 0 if unavailable
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
||||||
s.PowerW = sampleSystemPower()
|
s.PSUs = samplePSUPower()
|
||||||
|
|
||||||
|
// System power: use the global autotune-selected source when configured,
|
||||||
|
// otherwise fall back to the historical heuristic and mark the mode.
|
||||||
|
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
|
||||||
|
s.PowerW = powerW
|
||||||
|
s.PowerSource = decision.EffectiveSource
|
||||||
|
s.PowerMode = decision.Mode
|
||||||
|
s.PowerReason = decision.Reason
|
||||||
|
}
|
||||||
|
|
||||||
// CPU load — from /proc/stat
|
// CPU load — from /proc/stat
|
||||||
s.CPULoadPct = sampleCPULoadPct()
|
s.CPULoadPct = sampleCPULoadPct()
|
||||||
@@ -326,3 +348,46 @@ func compactAmbientTempName(chip, name string) string {
|
|||||||
}
|
}
|
||||||
return chip + " / " + name
|
return chip + " / " + name
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
||||||
|
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
||||||
|
// vendors where PSU sensors may not carry entity ID "10.N".
|
||||||
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
||||||
|
func samplePSUPower() []PSUReading {
|
||||||
|
out, err := exec.Command("ipmitool", "sdr").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slots := collector.PSUSlotsFromSDR(string(out))
|
||||||
|
if len(slots) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// Collect slot keys and sort for stable output.
|
||||||
|
keys := make([]int, 0, len(slots))
|
||||||
|
for k := range slots {
|
||||||
|
n, err := strconv.Atoi(k)
|
||||||
|
if err == nil {
|
||||||
|
keys = append(keys, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(keys)
|
||||||
|
psus := make([]PSUReading, 0, len(keys))
|
||||||
|
for _, k := range keys {
|
||||||
|
entry := slots[strconv.Itoa(k)]
|
||||||
|
// Prefer AC input power; fall back to DC output power.
|
||||||
|
var w float64
|
||||||
|
if entry.InputW != nil && *entry.InputW > 0 {
|
||||||
|
w = *entry.InputW
|
||||||
|
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
||||||
|
w = *entry.OutputW
|
||||||
|
}
|
||||||
|
if w <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
||||||
|
}
|
||||||
|
if len(psus) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|||||||
51
audit/internal/platform/nvidia_recover.go
Normal file
51
audit/internal/platform/nvidia_recover.go
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const nvidiaRecoverHelper = "/usr/local/bin/bee-nvidia-recover"
|
||||||
|
|
||||||
|
func runNvidiaRecover(args ...string) (string, error) {
|
||||||
|
helperArgs := append([]string{nvidiaRecoverHelper}, args...)
|
||||||
|
if _, err := exec.LookPath("systemd-run"); err == nil {
|
||||||
|
unit := fmt.Sprintf("bee-nvidia-recover-%d", time.Now().UnixNano())
|
||||||
|
cmdArgs := []string{
|
||||||
|
"systemd-run",
|
||||||
|
"--quiet",
|
||||||
|
"--pipe",
|
||||||
|
"--wait",
|
||||||
|
"--collect",
|
||||||
|
"--service-type=oneshot",
|
||||||
|
"--unit", unit,
|
||||||
|
}
|
||||||
|
cmdArgs = append(cmdArgs, helperArgs...)
|
||||||
|
raw, err := exec.Command("sudo", cmdArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||||
|
return string(raw), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func resetNvidiaGPU(index int) (string, error) {
|
||||||
|
if index < 0 {
|
||||||
|
return "", fmt.Errorf("gpu index must be >= 0")
|
||||||
|
}
|
||||||
|
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "GPU reset completed.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func restartNvidiaDrivers() (string, error) {
|
||||||
|
out, err := runNvidiaRecover("restart-drivers")
|
||||||
|
if strings.TrimSpace(out) == "" && err == nil {
|
||||||
|
out = "NVIDIA drivers restarted.\n"
|
||||||
|
}
|
||||||
|
return out, err
|
||||||
|
}
|
||||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaStressArchivePrefix(loader string) string {
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
|
|||||||
}
|
}
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
// Pack tar.gz
|
return runDir, nil
|
||||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
|
||||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
|
||||||
return "", fmt.Errorf("pack archive: %w", err)
|
|
||||||
}
|
|
||||||
_ = os.RemoveAll(runDir)
|
|
||||||
return archivePath, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// collectPhase samples live metrics every second until ctx is done.
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
@@ -392,6 +386,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +403,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +434,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +455,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -27,6 +28,8 @@ var runtimeTrackedServices = []string{
|
|||||||
"bee-audit",
|
"bee-audit",
|
||||||
"bee-web",
|
"bee-web",
|
||||||
"bee-sshsetup",
|
"bee-sshsetup",
|
||||||
|
"nvidia-dcgm",
|
||||||
|
"nvidia-fabricmanager",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
@@ -114,6 +117,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
}
|
}
|
||||||
|
|
||||||
s.collectGPURuntimeHealth(vendor, &health)
|
s.collectGPURuntimeHealth(vendor, &health)
|
||||||
|
s.collectToRAMHealth(&health)
|
||||||
|
s.collectUSBExportHealth(&health)
|
||||||
|
|
||||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||||
health.Status = "PARTIAL"
|
health.Status = "PARTIAL"
|
||||||
@@ -135,12 +140,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
case "nvidia":
|
case "nvidia":
|
||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
|
"dcgmi",
|
||||||
|
"nv-hostengine",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"bee-nccl-gpu-stress",
|
"bee-nccl-gpu-stress",
|
||||||
"all_reduce_perf",
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
|
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||||
@@ -155,11 +163,130 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
return tools
|
return tools
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
path, err := exec.LookPath(candidate)
|
||||||
|
if err == nil {
|
||||||
|
return ToolStatus{Name: display, Path: path, OK: true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ToolStatus{Name: display}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectToRAMHealth evaluates whether the live system is fully running from RAM.
|
||||||
|
// Status values: "ok" = fully in RAM, "warning" = not copied, "partial" = stale or
|
||||||
|
// incomplete RAM copy exists but runtime still depends on the boot medium,
|
||||||
|
// "failed" = toram was requested but medium is not in RAM.
|
||||||
|
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||||
|
state := s.LiveMediaRAMState()
|
||||||
|
health.ToRAMStatus = state.Status
|
||||||
|
switch state.Status {
|
||||||
|
case "ok":
|
||||||
|
return
|
||||||
|
case "failed":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_failed",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
case "partial":
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "toram_copy_partial",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: state.Message,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||||
|
// suitable for log export. Sets USBExportPath to the first match found.
|
||||||
|
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||||
|
health.USBExportPath = findUSBExportMount()
|
||||||
|
}
|
||||||
|
|
||||||
|
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||||
|
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||||
|
// has USB transport. Returns "" if none found.
|
||||||
|
func findUSBExportMount() string {
|
||||||
|
f, err := os.Open("/proc/mounts")
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// fs types that are expected on USB export drives
|
||||||
|
exportFSTypes := map[string]bool{
|
||||||
|
"vfat": true,
|
||||||
|
"exfat": true,
|
||||||
|
"ext2": true,
|
||||||
|
"ext3": true,
|
||||||
|
"ext4": true,
|
||||||
|
"ntfs": true,
|
||||||
|
"ntfs3": true,
|
||||||
|
"fuseblk": true,
|
||||||
|
}
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
// fields: device mountpoint fstype options dump pass
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) < 4 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||||
|
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Skip read-only mounts
|
||||||
|
opts := strings.Split(options, ",")
|
||||||
|
readOnly := false
|
||||||
|
for _, o := range opts {
|
||||||
|
if strings.TrimSpace(o) == "ro" {
|
||||||
|
readOnly = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if readOnly {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||||
|
if !strings.HasPrefix(device, "/dev/") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
checkDev := device
|
||||||
|
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||||
|
// Strip trailing partition digits to get the parent disk name.
|
||||||
|
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||||
|
checkDev = trimmed
|
||||||
|
}
|
||||||
|
if blockDeviceTransport(checkDev) == "usb" {
|
||||||
|
return mountPoint
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
lsmodText := commandText("lsmod")
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -12,19 +12,68 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Estimated wall-clock durations for each SAT/validate test, derived from real
|
||||||
|
// production logs in _benchmark/_v8/.
|
||||||
|
//
|
||||||
|
// Rule: whenever the commands, timeout parameters, or number of sub-jobs inside
|
||||||
|
// the corresponding Run*Pack function change, re-measure the wall-clock duration
|
||||||
|
// from actual task logs and update the matching constant here.
|
||||||
|
//
|
||||||
|
// Sources:
|
||||||
|
// - SATEstimatedCPUValidateSec: xFusion v8.6 — 62 s
|
||||||
|
// - SATEstimatedMemoryValidateSec: xFusion v8.6 — 68 s
|
||||||
|
// - SATEstimatedNvidiaGPUValidateSec: xFusion v8.6/v8.22 — 77–87 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaGPUStressSec: xFusion v8.6/v8.22 — 444–448 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedStressSec: xFusion v8.6/v8.22 — 347–348 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaTargetedPowerSec: MSI v8.22 / xFusion v8.6 — 346–351 s/GPU (measured per-GPU; re-measure after switch to all-GPU simultaneous)
|
||||||
|
// - SATEstimatedNvidiaPulseTestSec: xFusion v8.6 — 4 926 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaInterconnectSec: xFusion v8.6/v8.22 — 210–384 s / 8 GPU (all simultaneous)
|
||||||
|
// - SATEstimatedNvidiaBandwidthSec: xFusion v8.6/v8.22 — 2 664–2 688 s / 8 GPU (all simultaneous)
|
||||||
|
const (
|
||||||
|
// CPU stress: stress-ng 60 s + lscpu/sensors overhead.
|
||||||
|
SATEstimatedCPUValidateSec = 65
|
||||||
|
// CPU stress: stress-ng 1800 s (stress mode default).
|
||||||
|
SATEstimatedCPUStressSec = 1800
|
||||||
|
|
||||||
|
// RAM: memtester 256 MB / 1 pass.
|
||||||
|
SATEstimatedMemoryValidateSec = 70
|
||||||
|
// RAM: memtester 512 MB / 1 pass (extrapolated from validate timing, linear with size).
|
||||||
|
SATEstimatedMemoryStressSec = 140
|
||||||
|
|
||||||
|
// NVIDIA dcgmi diag Level 2 (medium), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUValidateSec = 85
|
||||||
|
// NVIDIA dcgmi diag Level 3 (targeted stress), all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaGPUStressSec = 450
|
||||||
|
|
||||||
|
// NVIDIA dcgmi targeted_stress 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedStressSec = 350
|
||||||
|
// NVIDIA dcgmi targeted_power 300 s + overhead, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaTargetedPowerSec = 350
|
||||||
|
|
||||||
|
// NVIDIA dcgmi pulse_test, all GPUs simultaneously (not per-GPU).
|
||||||
|
SATEstimatedNvidiaPulseTestSec = 5000
|
||||||
|
|
||||||
|
// NCCL all_reduce_perf, all GPUs simultaneously.
|
||||||
|
SATEstimatedNvidiaInterconnectSec = 300
|
||||||
|
// nvbandwidth, all GPUs simultaneously. Tool runs all built-in tests
|
||||||
|
// without a user-configurable time limit; duration is determined by nvbandwidth itself.
|
||||||
|
SATEstimatedNvidiaBandwidthSec = 2700
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
satGlob = filepath.Glob
|
satGlob = filepath.Glob
|
||||||
satStat = os.Stat
|
satStat = os.Stat
|
||||||
|
satFreeMemBytes = freeMemBytes
|
||||||
|
|
||||||
rocmSMIExecutableGlobs = []string{
|
rocmSMIExecutableGlobs = []string{
|
||||||
"/opt/rocm/bin/rocm-smi",
|
"/opt/rocm/bin/rocm-smi",
|
||||||
@@ -38,6 +87,12 @@ var (
|
|||||||
"/opt/rocm/bin/rvs",
|
"/opt/rocm/bin/rvs",
|
||||||
"/opt/rocm-*/bin/rvs",
|
"/opt/rocm-*/bin/rvs",
|
||||||
}
|
}
|
||||||
|
dcgmProfTesterCandidates = []string{
|
||||||
|
"dcgmproftester",
|
||||||
|
"dcgmproftester13",
|
||||||
|
"dcgmproftester12",
|
||||||
|
"dcgmproftester11",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||||
@@ -76,15 +131,46 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
|||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
MemoryMB int
|
MemoryMB int `json:"memory_mb"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type NvidiaGPUStatus struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Name string `json:"name"`
|
||||||
|
BDF string `json:"bdf,omitempty"`
|
||||||
|
Serial string `json:"serial,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
RawLine string `json:"raw_line,omitempty"`
|
||||||
|
NeedsReset bool `json:"needs_reset"`
|
||||||
|
ParseFailure bool `json:"parse_failure,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaGPUHealth struct {
|
||||||
|
Index int
|
||||||
|
Name string
|
||||||
|
NeedsReset bool
|
||||||
|
RawLine string
|
||||||
|
ParseFailure bool
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaGPUStatusFile struct {
|
||||||
|
Index int
|
||||||
|
Name string
|
||||||
|
RunStatus string
|
||||||
|
Reason string
|
||||||
|
Health string
|
||||||
|
HealthRaw string
|
||||||
|
Observed bool
|
||||||
|
Selected bool
|
||||||
|
FailingJob string
|
||||||
}
|
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||||
@@ -256,25 +342,206 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
MemoryMB: memMB,
|
MemoryMB: memMB,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
|
return gpus[i].Index < gpus[j].Index
|
||||||
|
})
|
||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var gpus []NvidiaGPUStatus
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 4 {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||||
|
status := "OK"
|
||||||
|
if needsReset {
|
||||||
|
status = "RESET_REQUIRED"
|
||||||
|
}
|
||||||
|
gpus = append(gpus, NvidiaGPUStatus{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||||
|
Serial: strings.TrimSpace(parts[3]),
|
||||||
|
Status: status,
|
||||||
|
RawLine: line,
|
||||||
|
NeedsReset: needsReset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||||
|
return gpus, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaBusID(v string) string {
|
||||||
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
|
parts := strings.Split(v, ":")
|
||||||
|
if len(parts) == 3 && len(parts[0]) > 4 {
|
||||||
|
parts[0] = parts[0][len(parts[0])-4:]
|
||||||
|
return strings.Join(parts, ":")
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||||
|
return resetNvidiaGPU(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
// detect GPU count
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
out, _ := exec.Command("nvidia-smi", "--query-gpu=index", "--format=csv,noheader").Output()
|
if err != nil {
|
||||||
gpuCount := len(strings.Split(strings.TrimSpace(string(out)), "\n"))
|
return "", err
|
||||||
|
}
|
||||||
|
gpuCount := len(selected)
|
||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}, env: nvidiaVisibleDevicesEnv(selected)},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var (
|
||||||
|
profCmd []string
|
||||||
|
profEnv []string
|
||||||
|
)
|
||||||
|
if len(selected) > 1 {
|
||||||
|
// For multiple GPUs, always spawn one dcgmproftester process per GPU via
|
||||||
|
// bee-dcgmproftester-staggered (stagger=0 means all start simultaneously).
|
||||||
|
// A single dcgmproftester process without -i only loads GPU 0 regardless
|
||||||
|
// of CUDA_VISIBLE_DEVICES.
|
||||||
|
stagger := staggerSec
|
||||||
|
if stagger < 0 {
|
||||||
|
stagger = 0
|
||||||
|
}
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(stagger),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
|
satJob{
|
||||||
|
name: "03-dcgmproftester.log",
|
||||||
|
cmd: profCmd,
|
||||||
|
env: profEnv,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-power.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-pulse-test.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -293,6 +560,30 @@ func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir
|
|||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
return dedupeSortedIndices(gpuIndices), nil
|
return dedupeSortedIndices(gpuIndices), nil
|
||||||
@@ -307,12 +598,45 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
|||||||
return all, nil
|
return all, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func memoryStressSizeArg() string {
|
||||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
return fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
availBytes := satFreeMemBytes()
|
||||||
|
if availBytes <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
availMB := availBytes / (1024 * 1024)
|
||||||
|
targetMB := (availMB * 2) / 3
|
||||||
|
if targetMB >= 256 {
|
||||||
|
targetMB = (targetMB / 256) * 256
|
||||||
|
}
|
||||||
|
if targetMB <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dM", targetMB)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
|
if sizeMB <= 0 {
|
||||||
|
sizeMB = 256
|
||||||
|
}
|
||||||
|
if passes <= 0 {
|
||||||
|
passes = 1
|
||||||
|
}
|
||||||
|
// Keep Validate Memory bounded to a quick diagnostic window. The timeout is
|
||||||
|
// intentionally conservative enough for healthy systems while avoiding the
|
||||||
|
// prior 30-80 minute hangs caused by memtester spinning on a bad subtest.
|
||||||
|
timeoutSec := sizeMB*passes*20/100 + 60
|
||||||
|
if timeoutSec < 180 {
|
||||||
|
timeoutSec = 180
|
||||||
|
}
|
||||||
|
if timeoutSec > 900 {
|
||||||
|
timeoutSec = 900
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
{name: "02-memtester.log", cmd: []string{"timeout", fmt.Sprintf("%d", timeoutSec), "memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||||
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
{name: "03-free-after.log", cmd: []string{"free", "-h"}},
|
||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
@@ -322,11 +646,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
|||||||
if seconds <= 0 {
|
if seconds <= 0 {
|
||||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
}
|
}
|
||||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||||
sizeArg := "80%"
|
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
sizeArg := memoryStressSizeArg()
|
||||||
sizeArg = fmt.Sprintf("%dM", mb)
|
|
||||||
}
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
@@ -368,7 +690,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
if baseDir == "" {
|
if baseDir == "" {
|
||||||
baseDir = "/var/log/bee-sat"
|
baseDir = "/var/log/bee-sat"
|
||||||
}
|
}
|
||||||
@@ -400,7 +722,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||||
commands := storageSATCommands(devPath)
|
commands := storageSATCommands(devPath, extended)
|
||||||
for cmdIndex, job := range commands {
|
for cmdIndex, job := range commands {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
@@ -422,11 +744,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type satJob struct {
|
type satJob struct {
|
||||||
@@ -443,14 +761,24 @@ type satStats struct {
|
|||||||
Unsupported int
|
Unsupported int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||||
|
out := make([]satJob, 0, len(jobs)+1)
|
||||||
|
out = append(out, satJob{
|
||||||
|
name: "00-nvidia-smi-persistence-mode.log",
|
||||||
|
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||||
|
})
|
||||||
|
out = append(out, jobs...)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
@@ -465,11 +793,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
|
args := []string{"dcgmi", "diag", "-r", name}
|
||||||
|
if durationSec > 0 {
|
||||||
|
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||||
|
}
|
||||||
|
return args
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaBurnDuration(durationSec int) int {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 300
|
||||||
|
}
|
||||||
|
return durationSec
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -489,11 +845,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
|
|
||||||
var summary strings.Builder
|
var summary strings.Builder
|
||||||
stats := satStats{}
|
stats := satStats{}
|
||||||
|
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||||
|
selectedGPUIndices := map[int]struct{}{}
|
||||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||||
for _, job := range jobs {
|
for _, job := range jobs {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
selectedGPUIndices[idx] = struct{}{}
|
||||||
|
status := perGPU[idx]
|
||||||
|
if status == nil {
|
||||||
|
status = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = status
|
||||||
|
}
|
||||||
|
status.Selected = true
|
||||||
|
}
|
||||||
cmd := make([]string, 0, len(job.cmd))
|
cmd := make([]string, 0, len(job.cmd))
|
||||||
for _, arg := range job.cmd {
|
for _, arg := range job.cmd {
|
||||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||||
@@ -502,17 +870,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
var out []byte
|
var out []byte
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
if job.collectGPU {
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
} else {
|
if logFunc != nil {
|
||||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
out = []byte(msg + "\n")
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
if job.collectGPU {
|
||||||
|
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||||
|
} else {
|
||||||
|
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(msg)
|
||||||
|
}
|
||||||
|
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||||
|
out = append(out, '\n')
|
||||||
|
}
|
||||||
|
out = append(out, []byte(msg+"\n")...)
|
||||||
|
if err == nil {
|
||||||
|
err = healthErr
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
|
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||||
|
for _, idx := range job.gpuIndices {
|
||||||
|
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||||
@@ -521,12 +924,204 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if nvidiaPack {
|
||||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
return "", err
|
||||||
return "", err
|
}
|
||||||
}
|
}
|
||||||
return archive, nil
|
|
||||||
|
return runDir, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = status
|
||||||
|
entry.FailingJob = jobName
|
||||||
|
entry.Reason = firstLine(detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err == nil {
|
||||||
|
for _, gpu := range health {
|
||||||
|
entry := perGPU[gpu.Index]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||||
|
perGPU[gpu.Index] = entry
|
||||||
|
}
|
||||||
|
entry.Name = gpu.Name
|
||||||
|
entry.Observed = true
|
||||||
|
entry.HealthRaw = gpu.RawLine
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
entry.Health = "RESET_REQUIRED"
|
||||||
|
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||||
|
entry.RunStatus = "FAILED"
|
||||||
|
if strings.TrimSpace(entry.Reason) == "" {
|
||||||
|
entry.Reason = "GPU requires reset"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
entry.Health = "OK"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for idx := range selected {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry == nil {
|
||||||
|
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||||
|
perGPU[idx] = entry
|
||||||
|
}
|
||||||
|
entry.Selected = true
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for idx := range perGPU {
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
|
for _, idx := range indices {
|
||||||
|
entry := perGPU[idx]
|
||||||
|
if entry.RunStatus == "" {
|
||||||
|
entry.RunStatus = overall
|
||||||
|
}
|
||||||
|
if entry.Health == "" {
|
||||||
|
entry.Health = "UNKNOWN"
|
||||||
|
}
|
||||||
|
if entry.Name == "" {
|
||||||
|
entry.Name = "Unknown GPU"
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||||
|
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||||
|
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||||
|
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||||
|
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||||
|
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||||
|
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||||
|
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.Reason) != "" {
|
||||||
|
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||||
|
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaSATStatusSeverity(status string) int {
|
||||||
|
switch strings.ToUpper(strings.TrimSpace(status)) {
|
||||||
|
case "FAILED":
|
||||||
|
return 3
|
||||||
|
case "PARTIAL", "UNSUPPORTED":
|
||||||
|
return 2
|
||||||
|
case "OK":
|
||||||
|
return 1
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstLine(s string) string {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
if s == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
|
||||||
|
return strings.TrimSpace(s[:idx])
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||||
|
if job.collectGPU {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||||
|
return strings.Contains(name, "dcgmi") ||
|
||||||
|
strings.Contains(name, "gpu-burn") ||
|
||||||
|
strings.Contains(name, "gpu-stress") ||
|
||||||
|
strings.Contains(name, "dcgmproftester")
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||||
|
health, err := readNvidiaGPUHealth()
|
||||||
|
if err != nil {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
var bad []nvidiaGPUHealth
|
||||||
|
selectedSet := make(map[int]struct{}, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
selectedSet[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
for _, gpu := range health {
|
||||||
|
if len(selectedSet) > 0 {
|
||||||
|
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if gpu.NeedsReset {
|
||||||
|
bad = append(bad, gpu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(bad) == 0 {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
lines := make([]string, 0, len(bad)+1)
|
||||||
|
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||||
|
for _, gpu := range bad {
|
||||||
|
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||||
|
}
|
||||||
|
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||||
|
out, err := satExecCommand(
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
).Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
return parseNvidiaGPUHealth(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||||
|
var gpus []nvidiaGPUHealth
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
if len(parts) < 2 {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
|
if err != nil {
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
upper := strings.ToUpper(line)
|
||||||
|
gpus = append(gpus, nvidiaGPUHealth{
|
||||||
|
Index: idx,
|
||||||
|
Name: strings.TrimSpace(parts[1]),
|
||||||
|
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||||
|
RawLine: line,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return gpus
|
||||||
}
|
}
|
||||||
|
|
||||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||||
@@ -583,17 +1178,25 @@ func listStorageDevices() ([]string, error) {
|
|||||||
return parseStorageDevices(string(out)), nil
|
return parseStorageDevices(string(out)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func storageSATCommands(devPath string) []satJob {
|
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||||
|
selfTestLevel := "1"
|
||||||
|
if extended {
|
||||||
|
selfTestLevel = "2"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
smartTestType := "short"
|
||||||
|
if extended {
|
||||||
|
smartTestType = "long"
|
||||||
|
}
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -642,6 +1245,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
}
|
}
|
||||||
if strings.Contains(text, "unsupported") ||
|
if strings.Contains(text, "unsupported") ||
|
||||||
strings.Contains(text, "not supported") ||
|
strings.Contains(text, "not supported") ||
|
||||||
|
strings.Contains(text, "not found in path") ||
|
||||||
strings.Contains(text, "invalid opcode") ||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
strings.Contains(text, "unknown command") ||
|
strings.Contains(text, "unknown command") ||
|
||||||
strings.Contains(text, "not implemented") ||
|
strings.Contains(text, "not implemented") ||
|
||||||
@@ -651,6 +1255,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
// nvidia-smi on a machine with no NVIDIA GPU
|
// nvidia-smi on a machine with no NVIDIA GPU
|
||||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||||
strings.Contains(text, "no nvidia gpu") ||
|
strings.Contains(text, "no nvidia gpu") ||
|
||||||
|
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||||
|
// while waiting, so the CLI stops polling without proving device failure.
|
||||||
|
(strings.Contains(name, "self-test") &&
|
||||||
|
strings.Contains(text, "no progress for") &&
|
||||||
|
strings.Contains(text, "stop waiting")) ||
|
||||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||||
return "UNSUPPORTED", rc
|
return "UNSUPPORTED", rc
|
||||||
}
|
}
|
||||||
@@ -748,6 +1357,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||||
|
for _, candidate := range dcgmProfTesterCandidates {
|
||||||
|
if path, err := satLookPath(candidate); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, errors.New("dcgmproftester not found in PATH")
|
||||||
|
}
|
||||||
|
|
||||||
func ensureAMDRuntimeReady() error {
|
func ensureAMDRuntimeReady() error {
|
||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -846,8 +1464,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
|||||||
if len(metricRows) > 0 {
|
if len(metricRows) > 0 {
|
||||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||||
chart := RenderGPUTerminalChart(metricRows)
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return out, err
|
return out, err
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -20,7 +21,7 @@ type FanStressOptions struct {
|
|||||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||||
PauseSec int // pause between the two load phases (default 60)
|
PauseSec int // pause between the two load phases (default 60)
|
||||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,27 +43,56 @@ type GPUStressMetric struct {
|
|||||||
|
|
||||||
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
|
||||||
type FanStressRow struct {
|
type FanStressRow struct {
|
||||||
TimestampUTC string
|
TimestampUTC string
|
||||||
ElapsedSec float64
|
ElapsedSec float64
|
||||||
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
|
||||||
GPUs []GPUStressMetric
|
GPUs []GPUStressMetric
|
||||||
Fans []FanReading
|
Fans []FanReading
|
||||||
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
|
||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64
|
||||||
|
SysPowerSource string
|
||||||
|
SysPowerMode string
|
||||||
}
|
}
|
||||||
|
|
||||||
type cachedPowerReading struct {
|
type cachedPowerReading struct {
|
||||||
Value float64
|
Value float64
|
||||||
|
Source string
|
||||||
|
Mode string
|
||||||
|
Reason string
|
||||||
UpdatedAt time.Time
|
UpdatedAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type fanObservationState struct {
|
||||||
|
MaxRPM map[string]float64 `json:"max_rpm"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type fanPeakCandidate struct {
|
||||||
|
FirstSeen time.Time
|
||||||
|
RPM float64
|
||||||
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
systemPowerCacheMu sync.Mutex
|
systemPowerCacheMu sync.Mutex
|
||||||
systemPowerCache cachedPowerReading
|
systemPowerCache cachedPowerReading
|
||||||
|
fanObservationMu sync.Mutex
|
||||||
|
fanObservation fanObservationState
|
||||||
|
fanObservationInit bool
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
)
|
)
|
||||||
|
|
||||||
const systemPowerHoldTTL = 15 * time.Second
|
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
|
var fanObservationStatePath = "/var/log/bee-sat/fan-observation.json"
|
||||||
|
|
||||||
|
const fanObservationMinPeakHold = time.Second
|
||||||
|
|
||||||
|
// normalizeObservedFanMaxRPM rounds an observed peak fan speed up to the next
// multiple of 1000 RPM; non-positive readings normalize to zero.
func normalizeObservedFanMaxRPM(rpm float64) float64 {
	if rpm > 0 {
		return 1000.0 * math.Ceil(rpm/1000.0)
	}
	return 0
}
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -223,11 +253,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
return runDir, nil
|
||||||
if err := createTarGz(archive, runDir); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return archive, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||||
@@ -243,9 +269,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
|||||||
if opts.Phase2DurSec <= 0 {
|
if opts.Phase2DurSec <= 0 {
|
||||||
opts.Phase2DurSec = 300
|
opts.Phase2DurSec = 300
|
||||||
}
|
}
|
||||||
if opts.SizeMB <= 0 {
|
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||||
opts.SizeMB = 64
|
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||||
@@ -258,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
|
|||||||
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
row.GPUs = sampleGPUStressMetrics(gpuIndices)
|
||||||
row.Fans, _ = sampleFanSpeeds()
|
row.Fans, _ = sampleFanSpeeds()
|
||||||
row.CPUMaxTempC = sampleCPUMaxTemp()
|
row.CPUMaxTempC = sampleCPUMaxTemp()
|
||||||
row.SysPowerW = sampleSystemPower()
|
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
|
||||||
return row
|
return row
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -315,11 +340,13 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
out, err := exec.Command("ipmitool", "sdr", "type", "Fan").Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
if fans := parseFanSpeeds(string(out)); len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
fans, sensorsErr := sampleFanSpeedsViaSensorsJSON()
|
||||||
if len(fans) > 0 {
|
if len(fans) > 0 {
|
||||||
|
updateFanObservation(fans, time.Now())
|
||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -328,6 +355,119 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
return nil, sensorsErr
|
return nil, sensorsErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadFanObservationLocked() {
|
||||||
|
if fanObservationInit {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanObservationInit = true
|
||||||
|
fanObservation.MaxRPM = make(map[string]float64)
|
||||||
|
raw, err := os.ReadFile(fanObservationStatePath)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var persisted fanObservationState
|
||||||
|
if json.Unmarshal(raw, &persisted) != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for name, rpm := range persisted.MaxRPM {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" || rpm <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanObservation.MaxRPM[name] = rpm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveFanObservationLocked() {
|
||||||
|
if len(fanObservation.MaxRPM) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
dir := filepath.Dir(fanObservationStatePath)
|
||||||
|
if dir == "" || dir == "." {
|
||||||
|
dir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
raw, err := json.MarshalIndent(fanObservation, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(fanObservationStatePath, raw, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateFanObservation folds a batch of fan readings into the persisted
// per-fan maximum-RPM observation state. A reading above a fan's recorded
// maximum does not immediately raise it: the reading becomes a "peak
// candidate" and must persist above the old maximum for at least
// fanObservationMinPeakHold before the maximum is updated (filters
// single-sample spikes). Updated maxima are rounded via
// normalizeObservedFanMaxRPM and written back to disk when anything changed.
func updateFanObservation(fans []FanReading, now time.Time) {
	if len(fans) == 0 {
		return
	}
	fanObservationMu.Lock()
	defer fanObservationMu.Unlock()
	loadFanObservationLocked()
	changed := false
	for _, fan := range fans {
		name := strings.TrimSpace(fan.Name)
		if name == "" || fan.RPM <= 0 {
			// Skip unnamed or stopped/invalid fans.
			continue
		}
		currentMax := fanObservation.MaxRPM[name]
		if fan.RPM <= currentMax {
			// Reading dropped back under the recorded max: abandon any
			// pending peak candidate for this fan.
			delete(fanPeakCandidates, name)
			continue
		}
		if cand, ok := fanPeakCandidates[name]; ok {
			if now.Sub(cand.FirstSeen) >= fanObservationMinPeakHold {
				// Candidate held above the old max long enough; commit the
				// highest RPM seen (candidate vs. current reading).
				newMax := math.Max(cand.RPM, fan.RPM)
				if newMax > currentMax {
					fanObservation.MaxRPM[name] = normalizeObservedFanMaxRPM(newMax)
					changed = true
				}
				delete(fanPeakCandidates, name)
				continue
			}
			// Still within the hold window: remember the highest candidate
			// RPM but keep the original FirstSeen timestamp.
			if fan.RPM > cand.RPM {
				fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: cand.FirstSeen, RPM: fan.RPM}
			}
			continue
		}
		// First reading above the recorded max: start a new candidate.
		fanPeakCandidates[name] = fanPeakCandidate{FirstSeen: now, RPM: fan.RPM}
	}
	if changed {
		saveFanObservationLocked()
	}
}
|
||||||
|
|
||||||
|
func estimateFanDutyCyclePctFromObservation(fans []FanReading) (float64, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
fanObservationMu.Lock()
|
||||||
|
defer fanObservationMu.Unlock()
|
||||||
|
loadFanObservationLocked()
|
||||||
|
var samples []float64
|
||||||
|
for _, fan := range fans {
|
||||||
|
name := strings.TrimSpace(fan.Name)
|
||||||
|
if name == "" || fan.RPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
maxRPM := fanObservation.MaxRPM[name]
|
||||||
|
if maxRPM <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pct := fan.RPM / maxRPM * 100.0
|
||||||
|
if pct > 100 {
|
||||||
|
pct = 100
|
||||||
|
}
|
||||||
|
if pct < 0 {
|
||||||
|
pct = 0
|
||||||
|
}
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Handles two formats:
|
// Handles two formats:
|
||||||
//
|
//
|
||||||
@@ -431,6 +571,116 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
|||||||
return fans, nil
|
return fans, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||||
|
// Returns the average duty cycle across all exposed PWM controls.
|
||||||
|
func sampleFanDutyCyclePct() (float64, bool, bool) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil || len(out) == 0 {
|
||||||
|
fans, fanErr := sampleFanSpeeds()
|
||||||
|
if fanErr != nil {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
return sampleFanDutyCyclePctFromFans(fans)
|
||||||
|
}
|
||||||
|
pct, ok := parseFanDutyCyclePctSensorsJSON(out)
|
||||||
|
return pct, ok, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, bool) {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
if pct, ok := estimateFanDutyCyclePctFromObservation(fans); ok {
|
||||||
|
return pct, true, true
|
||||||
|
}
|
||||||
|
return 0, false, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||||
|
var doc map[string]map[string]any
|
||||||
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
var samples []float64
|
||||||
|
for _, features := range doc {
|
||||||
|
for name, feature := range features {
|
||||||
|
if strings.EqualFold(name, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
featureMap, ok := feature.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||||
|
samples = append(samples, duty)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||||
|
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||||
|
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
if strings.Contains(featureName, "pwm") {
|
||||||
|
for _, key := range []string{"input", "value", "current"} {
|
||||||
|
if value, ok := feature[key]; ok {
|
||||||
|
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(feature))
|
||||||
|
for key := range feature {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if !strings.Contains(lower, "pwm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||||
|
return duty, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseFanDutyValue(value any) (float64, bool) {
|
||||||
|
switch v := value.(type) {
|
||||||
|
case float64:
|
||||||
|
return normalizePWMAsDutyPct(v)
|
||||||
|
case string:
|
||||||
|
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||||
|
return normalizePWMAsDutyPct(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizePWMAsDutyPct interprets a raw PWM value as a duty-cycle
// percentage. Values already in [0, 100] pass through unchanged; values in
// (100, 255] are scaled from the 8-bit PWM range; anything else (negative,
// > 255, or NaN) is rejected.
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		return raw, true
	case raw <= 255:
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
|
||||||
|
|
||||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||||
keys := make([]string, 0, len(feature))
|
keys := make([]string, 0, len(feature))
|
||||||
for key := range feature {
|
for key := range feature {
|
||||||
@@ -518,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
return max
|
return max
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPowerResolved reads system power via the global autotune source,
|
||||||
func sampleSystemPower() float64 {
|
// falling back to the historical heuristic before autotune or when degraded.
|
||||||
|
func sampleSystemPowerResolved() (float64, string, string) {
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
current := 0.0
|
current, decision, err := SampleSystemPowerResolved("")
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
|
||||||
if err == nil {
|
|
||||||
current = parseDCMIPowerReading(string(out))
|
|
||||||
}
|
|
||||||
systemPowerCacheMu.Lock()
|
systemPowerCacheMu.Lock()
|
||||||
defer systemPowerCacheMu.Unlock()
|
defer systemPowerCacheMu.Unlock()
|
||||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
if err != nil {
|
||||||
|
current = 0
|
||||||
|
}
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
|
||||||
systemPowerCache = updated
|
systemPowerCache = updated
|
||||||
return value
|
return value, updated.Source, updated.Mode
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -553,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
|
||||||
if current > 0 {
|
if current > 0 {
|
||||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
|
||||||
return current, cache
|
return current, cache
|
||||||
}
|
}
|
||||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@@ -29,6 +30,74 @@ func TestFirstFanInputValue(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"chip0": {
|
||||||
|
"fan1": {"input": 9000},
|
||||||
|
"pwm1": {"input": 128},
|
||||||
|
"pwm1_enable": {"input": 1}
|
||||||
|
},
|
||||||
|
"chip1": {
|
||||||
|
"pwm2": {"input": 64}
|
||||||
|
}
|
||||||
|
}`)
|
||||||
|
|
||||||
|
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||||
|
}
|
||||||
|
if got < 57 || got > 58 {
|
||||||
|
t.Fatalf("got=%v want ~57.1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEstimateFanDutyCyclePctFromObservation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldPath := fanObservationStatePath
|
||||||
|
oldState := fanObservation
|
||||||
|
oldInit := fanObservationInit
|
||||||
|
oldCandidates := fanPeakCandidates
|
||||||
|
fanObservationStatePath = filepath.Join(t.TempDir(), "fan-observation.json")
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
fanObservationStatePath = oldPath
|
||||||
|
fanObservation = oldState
|
||||||
|
fanObservationInit = oldInit
|
||||||
|
fanPeakCandidates = oldCandidates
|
||||||
|
})
|
||||||
|
|
||||||
|
start := time.Unix(100, 0)
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5000}}, start)
|
||||||
|
if _, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2500}}); ok {
|
||||||
|
t.Fatalf("single-sample spike should not establish observed max")
|
||||||
|
}
|
||||||
|
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5200}}, start.Add(500*time.Millisecond))
|
||||||
|
updateFanObservation([]FanReading{{Name: "FAN1", RPM: 5100}}, start.Add(1500*time.Millisecond))
|
||||||
|
|
||||||
|
got, ok := estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected estimated duty cycle from persisted observed max")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
fanObservation = fanObservationState{}
|
||||||
|
fanObservationInit = false
|
||||||
|
fanPeakCandidates = make(map[string]fanPeakCandidate)
|
||||||
|
got, ok = estimateFanDutyCyclePctFromObservation([]FanReading{{Name: "FAN1", RPM: 2600}})
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("expected persisted observed max to be reloaded from disk")
|
||||||
|
}
|
||||||
|
if got < 43 || got > 44 {
|
||||||
|
t.Fatalf("reloaded got=%v want ~43.3", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseDCMIPowerReading(t *testing.T) {
|
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
raw := `
|
raw := `
|
||||||
Instantaneous power reading: 512 Watts
|
Instantaneous power reading: 512 Watts
|
||||||
@@ -43,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
|
||||||
if got != 480 {
|
if got != 480 {
|
||||||
t.Fatalf("got=%v want cached 480", got)
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
}
|
}
|
||||||
@@ -51,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
t.Fatalf("updated=%+v", updated)
|
t.Fatalf("updated=%+v", updated)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
|
||||||
if got != 530 {
|
if got != 530 {
|
||||||
t.Fatalf("got=%v want 530", got)
|
t.Fatalf("got=%v want 530", got)
|
||||||
}
|
}
|
||||||
@@ -60,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
|
||||||
if got != 0 {
|
if got != 0 {
|
||||||
t.Fatalf("expired cache returned %v want 0", got)
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,23 +1,25 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
nvme := storageSATCommands("/dev/nvme0n1")
|
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||||
}
|
}
|
||||||
|
|
||||||
sata := storageSATCommands("/dev/sda")
|
sata := storageSATCommands("/dev/sda", false)
|
||||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||||
}
|
}
|
||||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
|
|
||||||
if len(jobs) != 5 {
|
if len(jobs) != 6 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||||
|
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[5].cmd
|
||||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||||
|
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||||
|
if len(jobs) != 5 {
|
||||||
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||||
|
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -195,6 +216,137 @@ func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||||
|
if len(got) != 2 {
|
||||||
|
t.Fatalf("len=%d want 2", len(got))
|
||||||
|
}
|
||||||
|
if got[0].NeedsReset {
|
||||||
|
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||||
|
}
|
||||||
|
if !got[1].NeedsReset {
|
||||||
|
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
msg, err := checkNvidiaJobHealth([]int{1})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected health check error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||||
|
t.Fatalf("unexpected message: %q", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||||
|
0: {Index: 0, RunStatus: "OK"},
|
||||||
|
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||||
|
}
|
||||||
|
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||||
|
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||||
|
}
|
||||||
|
text := string(raw)
|
||||||
|
if !strings.Contains(text, "run_status=FAILED") {
|
||||||
|
t.Fatalf("missing run status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||||
|
t.Fatalf("missing health status:\n%s", text)
|
||||||
|
}
|
||||||
|
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||||
|
t.Fatalf("missing failing job:\n%s", text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
switch file {
|
||||||
|
case "dcgmproftester13":
|
||||||
|
return "/usr/bin/dcgmproftester13", nil
|
||||||
|
default:
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 4 {
|
||||||
|
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandSkipsDurationForNVBandwidth(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, []int{2, 0})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "nvbandwidth", "-i", "2,0"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
|
if len(env) != 2 {
|
||||||
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
|
}
|
||||||
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -229,6 +381,37 @@ func TestEnvIntFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "65536M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "4096M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 0 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "80%" {
|
||||||
|
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestClassifySATResult(t *testing.T) {
|
func TestClassifySATResult(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
@@ -239,6 +422,7 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
|
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
@@ -253,6 +437,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -10,17 +10,30 @@ import (
|
|||||||
func (s *System) ListBeeServices() ([]string, error) {
|
func (s *System) ListBeeServices() ([]string, error) {
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
var out []string
|
var out []string
|
||||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
for _, pattern := range []string{
|
||||||
|
"/etc/systemd/system/bee-*.service",
|
||||||
|
"/lib/systemd/system/bee-*.service",
|
||||||
|
"/etc/systemd/system/bee-*.timer",
|
||||||
|
"/lib/systemd/system/bee-*.timer",
|
||||||
|
} {
|
||||||
matches, err := filepath.Glob(pattern)
|
matches, err := filepath.Glob(pattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
base := filepath.Base(match)
|
||||||
|
name := base
|
||||||
|
if strings.HasSuffix(base, ".service") {
|
||||||
|
name = strings.TrimSuffix(base, ".service")
|
||||||
|
}
|
||||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||||
if strings.HasSuffix(name, "@") {
|
if strings.HasSuffix(name, "@") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||||
|
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !seen[name] {
|
if !seen[name] {
|
||||||
seen[name] = true
|
seen[name] = true
|
||||||
out = append(out, name)
|
out = append(out, name)
|
||||||
@@ -48,7 +61,12 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
if name == "bee-nvidia" && action == ServiceRestart {
|
||||||
|
return restartNvidiaDrivers()
|
||||||
|
}
|
||||||
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
|||||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||||
|
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||||
|
|||||||
@@ -9,6 +9,17 @@ type LiveBootSource struct {
|
|||||||
Device string `json:"device,omitempty"`
|
Device string `json:"device,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type LiveMediaRAMState struct {
|
||||||
|
LiveBootSource
|
||||||
|
State string `json:"state"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
ToramActive bool `json:"toram_active,omitempty"`
|
||||||
|
CopyPresent bool `json:"copy_present,omitempty"`
|
||||||
|
CopyComplete bool `json:"copy_complete,omitempty"`
|
||||||
|
CanStartCopy bool `json:"can_start_copy,omitempty"`
|
||||||
|
Message string `json:"message,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type InterfaceInfo struct {
|
type InterfaceInfo struct {
|
||||||
Name string
|
Name string
|
||||||
State string
|
State string
|
||||||
@@ -44,12 +55,12 @@ type StaticIPv4Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RemovableTarget struct {
|
type RemovableTarget struct {
|
||||||
Device string
|
Device string `json:"device"`
|
||||||
FSType string
|
FSType string `json:"fs_type"`
|
||||||
Size string
|
Size string `json:"size"`
|
||||||
Label string
|
Label string `json:"label"`
|
||||||
Model string
|
Model string `json:"model"`
|
||||||
Mountpoint string
|
Mountpoint string `json:"mountpoint"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ToolStatus struct {
|
type ToolStatus struct {
|
||||||
@@ -70,6 +81,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
data, err := json.Marshal(RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
FSType: "exfat",
|
||||||
|
Size: "1.8T",
|
||||||
|
Label: "USB",
|
||||||
|
Model: "Flash",
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
raw := string(data)
|
||||||
|
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||||
|
if !strings.Contains(raw, key) {
|
||||||
|
t.Fatalf("json missing key %s: %s", key, raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||||
|
t.Fatalf("json still contains Go field names: %s", raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -15,12 +15,17 @@ type HardwareIngestRequest struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RuntimeHealth struct {
|
type RuntimeHealth struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CheckedAt string `json:"checked_at"`
|
CheckedAt string `json:"checked_at"`
|
||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
|
// ToRAMStatus: "ok" (fully in RAM), "warning" (not copied), "partial" (stale/incomplete copy exists), "failed" (toram active but copy failed)
|
||||||
|
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||||
|
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||||
|
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||||
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
|
|||||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||||
|
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||||
|
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||||
|
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||||
|
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||||
|
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||||
|
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,10 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
@@ -9,30 +12,6 @@ import (
|
|||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
|
||||||
t.Setenv("DISPLAY", "")
|
|
||||||
t.Setenv("XAUTHORITY", "")
|
|
||||||
|
|
||||||
cmd := xrandrCommand("--query")
|
|
||||||
|
|
||||||
var hasDisplay bool
|
|
||||||
var hasXAuthority bool
|
|
||||||
for _, kv := range cmd.Env {
|
|
||||||
if kv == "DISPLAY=:0" {
|
|
||||||
hasDisplay = true
|
|
||||||
}
|
|
||||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
|
||||||
hasXAuthority = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !hasDisplay {
|
|
||||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
if !hasXAuthority {
|
|
||||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
@@ -62,8 +41,311 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||||
t.Fatalf("burn profile=%q want smoke", got)
|
t.Fatalf("burn profile=%q want smoke", got)
|
||||||
}
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||||
|
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var state app.BlackboxState
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||||
|
t.Fatalf("decode state: %v", err)
|
||||||
|
}
|
||||||
|
if state.Status != "disabled" {
|
||||||
|
t.Fatalf("status=%q want disabled", state.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||||
|
exportDir := t.TempDir()
|
||||||
|
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||||
|
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||||
|
t.Fatalf("write state: %v", err)
|
||||||
|
}
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||||
|
|
||||||
|
h.handleAPIBlackboxStatus(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||||
|
t.Fatalf("body=%s", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-perf" {
|
||||||
|
t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
|
||||||
|
}
|
||||||
|
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||||
|
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||||
|
}
|
||||||
|
if task.params.RunNCCL {
|
||||||
|
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||||
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
// Ramp-up mode creates a single task that handles the 1→N GPU ramp internally
|
||||||
|
// (spawning N separate tasks would redundantly repeat all earlier ramp steps).
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1 (ramp-up uses single task)", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-bench-power" {
|
||||||
|
t.Fatalf("task target=%q want nvidia-bench-power", task.Target)
|
||||||
|
}
|
||||||
|
if task.Priority != taskPriorityBenchmark {
|
||||||
|
t.Fatalf("task priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||||
|
}
|
||||||
|
if task.params.RampTotal != 3 {
|
||||||
|
t.Fatalf("task RampTotal=%d want 3", task.params.RampTotal)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestHandleAPIBenchmarkAutotuneRunQueuesTask verifies that POSTing to the
// autotune run endpoint returns 200 and enqueues exactly one
// "nvidia-bench-autotune" task carrying the benchmark kind from the request
// body.
func TestHandleAPIBenchmarkAutotuneRunQueuesTask(t *testing.T) {
	// Isolate the shared queue: clear its task list now and restore the
	// original list on cleanup so other tests are unaffected.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/autotune/run", strings.NewReader(`{"profile":"standard","benchmark_kind":"power-fit"}`))
	rec := httptest.NewRecorder()

	h.handleAPIBenchmarkAutotuneRun().ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	// Inspect the queue under lock: exactly one autotune task should have
	// been enqueued, carrying the requested benchmark kind.
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 1 {
		t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
	}
	task := globalQueue.tasks[0]
	if task.Target != "nvidia-bench-autotune" {
		t.Fatalf("task target=%q want nvidia-bench-autotune", task.Target)
	}
	if task.params.BenchmarkKind != "power-fit" {
		t.Fatalf("task benchmark kind=%q want power-fit", task.params.BenchmarkKind)
	}
}
|
||||||
|
|
||||||
|
// TestHandleAPISATRunSplitsMixedNvidiaTaskSet verifies that a SAT run over a
// mixed GPU set (two H100s plus one H200) is split into two queued tasks —
// indices [0 1] and [2] — each at validate priority.
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
	// Isolate the shared queue: clear its task list now and restore the
	// original list on cleanup.
	globalQueue.mu.Lock()
	originalTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = originalTasks
		globalQueue.mu.Unlock()
	})
	// Stub GPU discovery: indices 0 and 1 report one model name, index 2
	// another, so the handler sees a mixed fleet.
	prevList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		return []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H200 NVL"},
		}, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = prevList })

	h := &handler{opts: HandlerOptions{App: &app.App{}}}
	req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
	rec := httptest.NewRecorder()

	h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)

	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	// Expect one task per group: [0 1] for the matching pair, [2] alone.
	if len(globalQueue.tasks) != 2 {
		t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
	}
	if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
		t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
	}
	if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
		t.Fatalf("task[1] gpu indices=%v want [2]", got)
	}
	// Both split tasks keep validate priority.
	if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
		t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
	}
	if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
		t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
	}
}
|
||||||
|
|
||||||
|
// TestDefaultTaskPriorityOrder pins both the target→priority mapping of
// defaultTaskPriority and the relative ordering of the priority constants:
// install-to-ram > audit > validate > validate-stress > burn > benchmark,
// with both benchmark kinds sharing one priority.
func TestDefaultTaskPriorityOrder(t *testing.T) {
	got := []int{
		defaultTaskPriority("install-to-ram", taskParams{}),
		defaultTaskPriority("audit", taskParams{}),
		defaultTaskPriority("cpu", taskParams{}),
		defaultTaskPriority("cpu", taskParams{StressMode: true}),
		defaultTaskPriority("nvidia-stress", taskParams{}),
		defaultTaskPriority("nvidia-bench-perf", taskParams{}),
		defaultTaskPriority("nvidia-bench-power", taskParams{}),
	}
	want := []int{
		taskPriorityInstallToRAM,
		taskPriorityAudit,
		taskPriorityValidate,
		taskPriorityValidateStress,
		taskPriorityBurn,
		taskPriorityBenchmark,
		taskPriorityBenchmark,
	}
	// Each target must map to its expected priority constant.
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
		}
	}
	// The constants themselves must be strictly decreasing, except the two
	// benchmark kinds, which must be equal.
	if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
		t.Fatalf("priority order=%v", got)
	}
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
|
|||||||
992
audit/internal/webui/charts_svg.go
Normal file
992
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,992 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// chartTimelineSegment is one contiguous span of chart time classified as
// task-active or idle; segments are shaded/bounded by the timeline writers.
type chartTimelineSegment struct {
	Start  time.Time
	End    time.Time
	Active bool
}

// chartScale describes a Y axis: the value range drawn and the tick values.
type chartScale struct {
	Min   float64
	Max   float64
	Ticks []float64
}

// chartLayout holds the canvas size and the pixel rectangle of the plot area
// within it.
type chartLayout struct {
	Width      int
	Height     int
	PlotLeft   int
	PlotRight  int
	PlotTop    int
	PlotBottom int
}

// metricChartSeries is one plotted line: its legend name, an axis title
// (used by the multi-axis GPU overview chart), stroke color, and values.
type metricChartSeries struct {
	Name      string
	AxisTitle string
	Color     string
	Values    []float64
}
|
||||||
|
|
||||||
|
// metricChartPalette is the cycle of series colors; series index i uses
// metricChartPalette[i%len(metricChartPalette)].
var metricChartPalette = []string{
	"#5794f2",
	"#73bf69",
	"#f2cc0c",
	"#ff9830",
	"#f2495c",
	"#b877d9",
	"#56d2f7",
	"#8ab8ff",
	"#9adf8f",
	"#ffbe5c",
}

// gpuLabelCache is a mutex-guarded cache of GPU labels keyed by GPU index,
// stamped with its load time. (Population and expiry logic live elsewhere in
// this package — not visible here.)
var gpuLabelCache struct {
	mu       sync.Mutex
	loadedAt time.Time
	byIndex  map[int]string
}
|
||||||
|
|
||||||
|
// renderMetricChartSVG renders a single-axis, multi-series line chart as a
// complete standalone SVG document. labels/times are padded (or synthesized)
// to a common point count, empty datasets become all-zero series, and the
// data is downsampled before drawing. yMin/yMax, when non-nil, override the
// data-derived axis bounds. timeline shades idle spans behind the plot.
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
	// Normalize to a single point count covering both labels and times.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		// Keep the chart renderable with one empty placeholder point.
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	for i := range datasets {
		if len(datasets[i]) == 0 {
			datasets[i] = make([]float64, pointCount)
		}
	}

	// Downsample to at most ~1400 points (one per pixel) before building SVG.
	times, datasets = downsampleTimeSeries(times, datasets, 1400)
	pointCount = len(times)

	statsLabel := chartStatsLabel(datasets)

	// One legend entry per name; the index guard tolerates fewer datasets
	// than names. Values are normalized via coalesceDataset (defined
	// elsewhere in this package).
	legendItems := []metricChartSeries{}
	for i, name := range names {
		color := metricChartPalette[i%len(metricChartPalette)]
		values := make([]float64, pointCount)
		if i < len(datasets) {
			copy(values, coalesceDataset(datasets[i], pointCount))
		}
		legendItems = append(legendItems, metricChartSeries{
			Name:   name,
			Color:  color,
			Values: values,
		})
	}

	scale := singleAxisChartScale(datasets, yMin, yMax)
	layout := singleAxisChartLayout(canvasHeight, len(legendItems))
	start, end := chartTimeBounds(times)

	// Assemble the SVG back-to-front: frame, idle shading, grids, timeline
	// boundaries, border, axes, then the series lines and legend on top.
	var b strings.Builder
	writeSVGOpen(&b, layout.Width, layout.Height)
	writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	writeHorizontalGrid(&b, layout, scale)
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)
	writeSingleAxisY(&b, layout, scale)
	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	for _, item := range legendItems {
		writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
	}
	writeLegend(&b, layout, legendItems)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
|
||||||
|
|
||||||
|
// renderGPUOverviewChartSVG builds the per-GPU overview chart (temperature,
// power, core clock) for GPU idx from the live sample history. The boolean
// result is false when the samples contain no data at all for that GPU.
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
	temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
	power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
	coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
	if temp == nil && power == nil && coreClock == nil {
		// No rows for this GPU index in any sample: nothing to draw.
		return nil, false, nil
	}
	labels := sampleTimeLabels(samples)
	times := sampleTimes(samples)
	// Series order (temp, power, clock) must match the three fixed axes in
	// drawGPUOverviewChartSVG.
	svg, err := drawGPUOverviewChartSVG(
		gpuDisplayLabel(idx)+" Overview",
		labels,
		times,
		[]metricChartSeries{
			{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
			{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
			{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
		},
		timeline,
	)
	if err != nil {
		return nil, false, err
	}
	return svg, true, nil
}
|
||||||
|
|
||||||
|
// drawGPUOverviewChartSVG renders the three-series GPU overview chart, each
// series with its own independent Y scale and a dedicated colored axis
// column (two on the left of the plot, one on the right). Exactly three
// series are required; fewer or more is an error.
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
	if len(series) != 3 {
		return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
	}
	// Fixed canvas geometry for the overview chart.
	const (
		width      = 1400
		height     = 840
		plotLeft   = 180
		plotRight  = 1220
		plotTop    = 96
		plotBottom = 660
	)
	// X positions of the three per-series axis lines.
	const (
		leftOuterAxis  = 72
		leftInnerAxis  = 132
		rightInnerAxis = 1268
	)
	layout := chartLayout{
		Width:      width,
		Height:     height,
		PlotLeft:   plotLeft,
		PlotRight:  plotRight,
		PlotTop:    plotTop,
		PlotBottom: plotBottom,
	}
	// axisX[i] is the axis column for series[i]; order matters.
	axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
	// Normalize labels/times/series values to a common point count.
	pointCount := len(labels)
	if len(times) > pointCount {
		pointCount = len(times)
	}
	if pointCount == 0 {
		pointCount = 1
		labels = []string{""}
		times = []time.Time{time.Time{}}
	}
	if len(labels) < pointCount {
		padded := make([]string, pointCount)
		copy(padded, labels)
		labels = padded
	}
	if len(times) < pointCount {
		times = synthesizeChartTimes(times, pointCount)
	}
	for i := range series {
		if len(series[i].Values) == 0 {
			series[i].Values = make([]float64, pointCount)
		}
	}

	// Downsample to at most ~1400 points before building SVG.
	{
		datasets := make([][]float64, len(series))
		for i := range series {
			datasets[i] = series[i].Values
		}
		times, datasets = downsampleTimeSeries(times, datasets, 1400)
		pointCount = len(times)
		for i := range series {
			series[i].Values = datasets[i]
		}
	}

	// Compute an independent "nice" scale per series.
	scales := make([]chartScale, len(series))
	for i := range series {
		min, max := chartSeriesBounds(series[i].Values)
		ticks := chartNiceTicks(min, max, 8)
		scales[i] = chartScale{
			Min:   ticks[0],
			Max:   ticks[len(ticks)-1],
			Ticks: ticks,
		}
	}
	start, end := chartTimeBounds(times)

	var b strings.Builder
	writeSVGOpen(&b, width, height)
	writeChartFrame(&b, title, "", width, height)
	writeTimelineIdleSpans(&b, layout, start, end, timeline)
	writeVerticalGrid(&b, layout, times, pointCount, 8)
	// Horizontal grid follows the first series' scale only.
	writeHorizontalGrid(&b, layout, scales[0])
	writeTimelineBoundaries(&b, layout, start, end, timeline)
	writePlotBorder(&b, layout)

	// Draw each colored axis line, its title, and its tick marks/labels. The
	// two left axes (i < 2) put tick labels left of the line; the right axis
	// mirrors them to the right.
	for i, axisLineX := range axisX {
		fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
			axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
			axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
		for _, tick := range scales[i].Ticks {
			y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
			label := sanitizeChartText(chartYAxisNumber(tick))
			if i < 2 {
				fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
					axisLineX, y, axisLineX+6, y, series[i].Color)
				fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
					axisLineX-8, y, series[i].Color, label)
				continue
			}
			fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
				axisLineX, y, axisLineX-6, y, series[i].Color)
			fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
				axisLineX+8, y, series[i].Color, label)
		}
	}

	writeXAxisLabels(&b, layout, times, labels, start, end, 8)
	// Each series is drawn against its own scale.
	for i := range series {
		writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
	}
	writeLegend(&b, layout, series)
	writeSVGClose(&b)
	return []byte(b.String()), nil
}
|
||||||
|
|
||||||
|
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotTaskHistory() []Task {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
out := make([]Task, len(globalQueue.tasks))
|
||||||
|
for i, t := range globalQueue.tasks {
|
||||||
|
out[i] = *t
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartTimelineSegmentsForRange classifies the [start, end] chart range into
// alternating active/idle segments based on task start/done times. Tasks with
// no StartedAt are ignored; a task without DoneAt is treated as running until
// "now". Overlapping task intervals are merged before the gaps between them
// are emitted as idle segments. An inverted range is swapped; zero bounds
// yield nil.
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
	if start.IsZero() || end.IsZero() {
		return nil
	}
	if end.Before(start) {
		start, end = end, start
	}
	type interval struct {
		start time.Time
		end   time.Time
	}
	// Collect each task's run interval, clipped to [start, end].
	active := make([]interval, 0, len(tasks))
	for _, task := range tasks {
		if task.StartedAt == nil {
			continue
		}
		intervalStart := task.StartedAt.UTC()
		intervalEnd := now.UTC()
		if task.DoneAt != nil {
			intervalEnd = task.DoneAt.UTC()
		}
		if !intervalEnd.After(intervalStart) {
			continue // empty or inverted interval
		}
		if intervalEnd.Before(start) || intervalStart.After(end) {
			continue // entirely outside the chart range
		}
		if intervalStart.Before(start) {
			intervalStart = start
		}
		if intervalEnd.After(end) {
			intervalEnd = end
		}
		active = append(active, interval{start: intervalStart, end: intervalEnd})
	}
	// Sort by start (then end) so overlapping intervals become adjacent.
	sort.Slice(active, func(i, j int) bool {
		if active[i].start.Equal(active[j].start) {
			return active[i].end.Before(active[j].end)
		}
		return active[i].start.Before(active[j].start)
	})
	// Merge overlapping/touching intervals into disjoint spans.
	merged := make([]interval, 0, len(active))
	for _, span := range active {
		if len(merged) == 0 {
			merged = append(merged, span)
			continue
		}
		last := &merged[len(merged)-1]
		if !span.start.After(last.end) {
			if span.end.After(last.end) {
				last.end = span.end
			}
			continue
		}
		merged = append(merged, span)
	}

	// Walk the range, emitting idle gaps between the merged active spans.
	segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
	cursor := start
	for _, span := range merged {
		if span.start.After(cursor) {
			segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
		}
		segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
		cursor = span.end
	}
	if cursor.Before(end) {
		segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
	}
	if len(segments) == 0 {
		// No overlapping tasks at all: the whole range is one idle segment.
		segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
	}
	return segments
}
|
||||||
|
|
||||||
|
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||||
|
times := make([]time.Time, 0, len(samples))
|
||||||
|
for _, sample := range samples {
|
||||||
|
times = append(times, sample.Timestamp)
|
||||||
|
}
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
|
||||||
|
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||||
|
min, max := 0.0, 1.0
|
||||||
|
if yMin != nil && yMax != nil {
|
||||||
|
min, max = *yMin, *yMax
|
||||||
|
} else {
|
||||||
|
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||||
|
if yMin != nil {
|
||||||
|
min = *yMin
|
||||||
|
}
|
||||||
|
if yMax != nil {
|
||||||
|
max = *yMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||||
|
}
|
||||||
|
|
||||||
|
// flattenDatasets concatenates all datasets into one slice, preallocating the
// exact combined capacity.
func flattenDatasets(datasets [][]float64) []float64 {
	n := 0
	for _, d := range datasets {
		n += len(d)
	}
	flat := make([]float64, 0, n)
	for _, d := range datasets {
		flat = append(flat, d...)
	}
	return flat
}
|
||||||
|
|
||||||
|
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||||
|
legendRows := 0
|
||||||
|
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||||
|
cols := 4
|
||||||
|
if seriesCount < cols {
|
||||||
|
cols = seriesCount
|
||||||
|
}
|
||||||
|
legendRows = (seriesCount + cols - 1) / cols
|
||||||
|
}
|
||||||
|
legendHeight := 0
|
||||||
|
if legendRows > 0 {
|
||||||
|
legendHeight = legendRows*24 + 24
|
||||||
|
}
|
||||||
|
return chartLayout{
|
||||||
|
Width: 1400,
|
||||||
|
Height: canvasHeight,
|
||||||
|
PlotLeft: 96,
|
||||||
|
PlotRight: 1352,
|
||||||
|
PlotTop: 72,
|
||||||
|
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
|
||||||
|
if len(times) == 0 {
|
||||||
|
return time.Time{}, time.Time{}
|
||||||
|
}
|
||||||
|
start := times[0].UTC()
|
||||||
|
end := start
|
||||||
|
for _, ts := range times[1:] {
|
||||||
|
t := ts.UTC()
|
||||||
|
if t.Before(start) {
|
||||||
|
start = t
|
||||||
|
}
|
||||||
|
if t.After(end) {
|
||||||
|
end = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||||
|
if count <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if len(times) == count {
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
if len(times) == 1 {
|
||||||
|
out := make([]time.Time, count)
|
||||||
|
for i := range out {
|
||||||
|
out[i] = times[0].Add(time.Duration(i) * time.Minute)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
|
||||||
|
out := make([]time.Time, count)
|
||||||
|
for i := range out {
|
||||||
|
out[i] = base.Add(time.Duration(i) * time.Minute)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderStackedMetricChartSVG renders a stacked area chart where each dataset
|
||||||
|
// is visually "stacked" on top of the previous one. Intended for multi-PSU
|
||||||
|
// power charts where the filled area of each PSU shows its individual
|
||||||
|
// contribution and the total height equals the combined draw.
|
||||||
|
func renderStackedMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||||
|
pointCount = len(times)
|
||||||
|
|
||||||
|
// Build cumulative sums per time point.
|
||||||
|
cumulative := make([][]float64, len(datasets)+1)
|
||||||
|
for i := range cumulative {
|
||||||
|
cumulative[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
for i, ds := range datasets {
|
||||||
|
for j, v := range ds {
|
||||||
|
cumulative[i+1][j] = cumulative[i][j] + v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale is based on the total (top cumulative row).
|
||||||
|
total := cumulative[len(cumulative)-1]
|
||||||
|
yMin := floatPtr(0)
|
||||||
|
if yMax == nil {
|
||||||
|
yMax = autoMax120(total)
|
||||||
|
}
|
||||||
|
scale := singleAxisChartScale([][]float64{total}, yMin, yMax)
|
||||||
|
|
||||||
|
legendItems := make([]metricChartSeries, len(datasets))
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
legendItems[i] = metricChartSeries{Name: name, Color: color, Values: datasets[i]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stats label from totals.
|
||||||
|
statsLabel := chartStatsLabel([][]float64{total})
|
||||||
|
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
|
||||||
|
// Draw stacked areas from top to bottom so lower layers are visible.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeStackedArea(&b, layout, times, start, end, cumulative[i], cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
// Draw border polylines on top.
|
||||||
|
for i := len(datasets) - 1; i >= 0; i-- {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, cumulative[i+1], scale, legendItems[i].Color)
|
||||||
|
}
|
||||||
|
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeStackedArea draws a filled polygon between two cumulative value arrays
// (baseline and top), using the given color at 55% opacity.
func writeStackedArea(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, baseline, top []float64, scale chartScale, color string) {
	n := len(top)
	if n == 0 {
		return
	}
	// A short baseline is replaced by all-zeros (the bottom of the stack).
	if len(baseline) < n {
		baseline = make([]float64, n)
	}

	// Forward path along top values, then backward along baseline values.
	// Together they enclose this layer's band as a closed polygon.
	var points strings.Builder
	for i := 0; i < n; i++ {
		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(valueClamp(top[i], scale), scale, layout.PlotTop, layout.PlotBottom)
		if i > 0 {
			points.WriteByte(' ')
		}
		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
		points.WriteByte(',')
		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
	}
	for i := n - 1; i >= 0; i-- {
		x := chartXForTime(chartPointTime(times, i), start, end, layout.PlotLeft, layout.PlotRight)
		y := chartYForValue(valueClamp(baseline[i], scale), scale, layout.PlotTop, layout.PlotBottom)
		points.WriteByte(' ')
		points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
		points.WriteByte(',')
		points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
	}
	fmt.Fprintf(b, `<polygon points="%s" fill="%s" fill-opacity="0.55" stroke="none"/>`+"\n", points.String(), color)
}
|
||||||
|
|
||||||
|
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSVGClose(b *strings.Builder) {
|
||||||
|
b.WriteString("</svg>\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(title))
|
||||||
|
if strings.TrimSpace(subtitle) != "" {
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(subtitle))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||||
|
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotRight, y)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||||
|
if pointCount <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
ts := chartPointTime(times, idx)
|
||||||
|
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||||
|
x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||||
|
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
label := ""
|
||||||
|
if idx < len(labels) {
|
||||||
|
label = labels[idx]
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||||
|
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||||
|
if len(values) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var points strings.Builder
|
||||||
|
for idx, value := range values {
|
||||||
|
if idx > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||||
|
points.String(), color)
|
||||||
|
if len(values) == 1 {
|
||||||
|
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
peakIdx := 0
|
||||||
|
peakValue := values[0]
|
||||||
|
for idx, value := range values[1:] {
|
||||||
|
if value >= peakValue {
|
||||||
|
peakIdx = idx + 1
|
||||||
|
peakValue = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||||
|
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||||
|
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||||
|
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cols := 4
|
||||||
|
if len(series) < cols {
|
||||||
|
cols = len(series)
|
||||||
|
}
|
||||||
|
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||||
|
baseY := layout.PlotBottom + 74
|
||||||
|
for i, item := range series {
|
||||||
|
row := i / cols
|
||||||
|
col := i % cols
|
||||||
|
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||||
|
y := float64(baseY + row*24)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||||
|
x, y, x+28, y, item.Color)
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
x+38, y+4, sanitizeChartText(item.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||||
|
for _, segment := range segments {
|
||||||
|
if segment.Active || !segment.End.After(segment.Start) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||||
|
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen := map[int]bool{}
|
||||||
|
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||||
|
for i, segment := range segments {
|
||||||
|
if i > 0 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i < len(segments)-1 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// downsampleTimeSeries reduces the time series to at most maxPts points using
|
||||||
|
// min-max bucketing. Each bucket contributes the index of its min and max value
|
||||||
|
// (using the first full-length dataset as the reference series). All parallel
|
||||||
|
// datasets are sampled at those same indices so all series stay aligned.
|
||||||
|
// If len(times) <= maxPts the inputs are returned unchanged.
|
||||||
|
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
|
||||||
|
n := len(times)
|
||||||
|
if n <= maxPts || maxPts <= 0 {
|
||||||
|
return times, datasets
|
||||||
|
}
|
||||||
|
buckets := maxPts / 2
|
||||||
|
if buckets < 1 {
|
||||||
|
buckets = 1
|
||||||
|
}
|
||||||
|
// Use the first dataset that has the same length as times as the reference
|
||||||
|
// for deciding which two indices to keep per bucket.
|
||||||
|
var ref []float64
|
||||||
|
for _, ds := range datasets {
|
||||||
|
if len(ds) == n {
|
||||||
|
ref = ds
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
selected := make([]int, 0, maxPts)
|
||||||
|
bucketSize := float64(n) / float64(buckets)
|
||||||
|
for b := 0; b < buckets; b++ {
|
||||||
|
lo := int(math.Round(float64(b) * bucketSize))
|
||||||
|
hi := int(math.Round(float64(b+1) * bucketSize))
|
||||||
|
if hi > n {
|
||||||
|
hi = n
|
||||||
|
}
|
||||||
|
if lo >= hi {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if ref == nil {
|
||||||
|
selected = append(selected, lo)
|
||||||
|
if hi-1 != lo {
|
||||||
|
selected = append(selected, hi-1)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
minIdx, maxIdx := lo, lo
|
||||||
|
for i := lo + 1; i < hi; i++ {
|
||||||
|
if ref[i] < ref[minIdx] {
|
||||||
|
minIdx = i
|
||||||
|
}
|
||||||
|
if ref[i] > ref[maxIdx] {
|
||||||
|
maxIdx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if minIdx <= maxIdx {
|
||||||
|
selected = append(selected, minIdx)
|
||||||
|
if maxIdx != minIdx {
|
||||||
|
selected = append(selected, maxIdx)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
selected = append(selected, maxIdx)
|
||||||
|
if minIdx != maxIdx {
|
||||||
|
selected = append(selected, minIdx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outTimes := make([]time.Time, len(selected))
|
||||||
|
for i, idx := range selected {
|
||||||
|
outTimes[i] = times[idx]
|
||||||
|
}
|
||||||
|
outDatasets := make([][]float64, len(datasets))
|
||||||
|
for d, ds := range datasets {
|
||||||
|
if len(ds) != n {
|
||||||
|
outDatasets[d] = ds
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out := make([]float64, len(selected))
|
||||||
|
for i, idx := range selected {
|
||||||
|
out[i] = ds[idx]
|
||||||
|
}
|
||||||
|
outDatasets[d] = out
|
||||||
|
}
|
||||||
|
return outTimes, outDatasets
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||||
|
if !end.After(start) {
|
||||||
|
return float64(left+right) / 2
|
||||||
|
}
|
||||||
|
if ts.Before(start) {
|
||||||
|
ts = start
|
||||||
|
}
|
||||||
|
if ts.After(end) {
|
||||||
|
ts = end
|
||||||
|
}
|
||||||
|
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
|
||||||
|
return float64(left) + ratio*float64(right-left)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartPointTime(times []time.Time, idx int) time.Time {
|
||||||
|
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
|
||||||
|
return times[idx].UTC()
|
||||||
|
}
|
||||||
|
if len(times) > 0 && !times[0].IsZero() {
|
||||||
|
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
|
||||||
|
}
|
||||||
|
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||||
|
if scale.Max <= scale.Min {
|
||||||
|
return float64(plotTop+plotBottom) / 2
|
||||||
|
}
|
||||||
|
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartSeriesBounds computes padded (min, max) axis bounds for a series.
// Empty input yields (0, 1). A flat nonzero series is padded by 10% of its
// magnitude (or 1 when that is zero). A series sitting entirely above zero
// gets 20% padding on both sides, with the lower bound clamped at zero.
// Locals are named lo/hi to avoid shadowing the min/max builtins.
func chartSeriesBounds(values []float64) (float64, float64) {
	if len(values) == 0 {
		return 0, 1
	}
	lo, hi := values[0], values[0]
	for _, v := range values[1:] {
		if v < lo {
			lo = v
		}
		if v > hi {
			hi = v
		}
	}
	if lo == hi {
		if hi == 0 {
			return 0, 1
		}
		pad := math.Abs(hi) * 0.1
		if pad == 0 {
			pad = 1
		}
		lo -= pad
		hi += pad
	}
	if lo > 0 {
		pad := (hi - lo) * 0.2
		if pad == 0 {
			pad = hi * 0.1
		}
		lo -= pad
		if lo < 0 {
			lo = 0
		}
		hi += pad
	}
	return lo, hi
}
|
||||||
|
|
||||||
|
// chartNiceTicks produces "nice" axis tick values covering [min, max], aiming
// for roughly target ticks. Steps are a power of ten times 1, 2, 5 or 10; the
// first factor yielding at most target*1.5 intervals wins. Each tick is
// rounded to 1e-9 to suppress floating-point accumulation noise.
func chartNiceTicks(min, max float64, target int) []float64 {
	if min == max {
		max = min + 1
	}
	span := max - min
	base := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
	step := base
	for _, factor := range []float64{1, 2, 5, 10} {
		if span/(factor*base) <= float64(target)*1.5 {
			step = factor * base
			break
		}
	}
	low := math.Floor(min/step) * step
	high := math.Ceil(max/step) * step
	var ticks []float64
	// The step*0.001 slack keeps the final tick despite rounding error.
	for v := low; v <= high+step*0.001; v += step {
		ticks = append(ticks, math.Round(v*1e9)/1e9)
	}
	return ticks
}
|
||||||
|
|
||||||
|
func valueClamp(value float64, scale chartScale) float64 {
|
||||||
|
if value < scale.Min {
|
||||||
|
return scale.Min
|
||||||
|
}
|
||||||
|
if value > scale.Max {
|
||||||
|
return scale.Max
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartStatsLabel(datasets [][]float64) string {
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("min %s avg %s max %s",
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDisplayLabel(idx int) string {
|
||||||
|
if name := gpuModelNameByIndex(idx); name != "" {
|
||||||
|
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU %d", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuModelNameByIndex(idx int) string {
|
||||||
|
now := time.Now()
|
||||||
|
gpuLabelCache.mu.Lock()
|
||||||
|
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||||
|
gpuLabelCache.loadedAt = now
|
||||||
|
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||||
|
gpuLabelCache.mu.Unlock()
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadGPUModelNames() map[int]string {
|
||||||
|
out := map[int]string{}
|
||||||
|
gpus, err := platform.New().ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name != "" {
|
||||||
|
out[gpu.Index] = name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -1,6 +1,9 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -9,13 +12,33 @@ import (
|
|||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
|
serialPrefix string
|
||||||
|
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||||
|
logBuf *bufio.Writer
|
||||||
|
}
|
||||||
|
|
||||||
|
// readTaskLogFile reads a task log, refusing files over 50 MB.
|
||||||
|
func readTaskLogFile(path string) ([]byte, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(f, 50<<20+1))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if int64(len(data)) > 50<<20 {
|
||||||
|
return nil, fmt.Errorf("task log %s too large (exceeds 50 MB)", path)
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -30,11 +53,22 @@ func (j *jobState) abort() bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *jobState) append(line string) {
|
func (j *jobState) append(line string) {
|
||||||
|
j.appendWithOptions(line, true, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendFromLog(line string) {
|
||||||
|
j.appendWithOptions(line, false, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
j.lines = append(j.lines, line)
|
j.lines = append(j.lines, line)
|
||||||
if j.logPath != "" {
|
if persistLog && j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
j.writeLogLineLocked(line)
|
||||||
|
}
|
||||||
|
if serialMirror && j.serialPrefix != "" {
|
||||||
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
}
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
@@ -44,6 +78,35 @@ func (j *jobState) append(line string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// writeLogLineLocked writes a line to the persistent log file, opening it lazily.
|
||||||
|
// Must be called with j.mu held. Uses a buffered writer kept open for the task
|
||||||
|
// lifetime — avoids thousands of open/close syscalls during high-frequency logs.
|
||||||
|
func (j *jobState) writeLogLineLocked(line string) {
|
||||||
|
if j.logFile == nil {
|
||||||
|
f, err := os.OpenFile(j.logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
j.logFile = f
|
||||||
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
|
}
|
||||||
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
func (j *jobState) closeLog() {
|
||||||
|
j.mu.Lock()
|
||||||
|
defer j.mu.Unlock()
|
||||||
|
if j.logBuf != nil {
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
|
}
|
||||||
|
if j.logFile != nil {
|
||||||
|
_ = j.logFile.Close()
|
||||||
|
j.logFile = nil
|
||||||
|
j.logBuf = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (j *jobState) finish(errMsg string) {
|
func (j *jobState) finish(errMsg string) {
|
||||||
j.mu.Lock()
|
j.mu.Lock()
|
||||||
defer j.mu.Unlock()
|
defer j.mu.Unlock()
|
||||||
@@ -84,12 +147,12 @@ func (m *jobManager) create(id string) *jobState {
|
|||||||
j := &jobState{}
|
j := &jobState{}
|
||||||
m.jobs[id] = j
|
m.jobs[id] = j
|
||||||
// Schedule cleanup after 30 minutes
|
// Schedule cleanup after 30 minutes
|
||||||
go func() {
|
goRecoverOnce("job cleanup", func() {
|
||||||
time.Sleep(30 * time.Minute)
|
time.Sleep(30 * time.Minute)
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
delete(m.jobs, id)
|
delete(m.jobs, id)
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
}()
|
})
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,12 +170,15 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTaskJobState(logPath string) *jobState {
|
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||||
j := &jobState{logPath: logPath}
|
j := &jobState{logPath: logPath}
|
||||||
|
if len(serialPrefix) > 0 {
|
||||||
|
j.serialPrefix = serialPrefix[0]
|
||||||
|
}
|
||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
data, err := os.ReadFile(logPath)
|
data, err := readTaskLogFile(logPath)
|
||||||
if err != nil || len(data) == 0 {
|
if err != nil || len(data) == 0 {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,10 +17,10 @@ import (
|
|||||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
// while any SAT task is running, and flushed when all tasks complete.
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
type kmsgWatcher struct {
|
type kmsgWatcher struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
activeCount int // number of in-flight SAT tasks
|
activeCount int // number of in-flight SAT tasks
|
||||||
window *kmsgWindow
|
window *kmsgWindow
|
||||||
statusDB *app.ComponentStatusDB
|
statusDB *app.ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type kmsgWindow struct {
|
type kmsgWindow struct {
|
||||||
@@ -48,36 +48,39 @@ func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
|||||||
|
|
||||||
// start launches the background kmsg reading goroutine.
|
// start launches the background kmsg reading goroutine.
|
||||||
func (w *kmsgWatcher) start() {
|
func (w *kmsgWatcher) start() {
|
||||||
go w.run()
|
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) run() {
|
func (w *kmsgWatcher) run() {
|
||||||
f, err := os.Open("/dev/kmsg")
|
for {
|
||||||
if err != nil {
|
f, err := os.Open("/dev/kmsg")
|
||||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
if err != nil {
|
||||||
return
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
}
|
time.Sleep(30 * time.Second)
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
// Best-effort seek to end so we only capture events from now forward.
|
|
||||||
_, _ = f.Seek(0, io.SeekEnd)
|
|
||||||
|
|
||||||
scanner := bufio.NewScanner(f)
|
|
||||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := scanner.Text()
|
|
||||||
evt, ok := parseKmsgLine(line)
|
|
||||||
if !ok {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
w.mu.Lock()
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
if w.window != nil {
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
w.recordEvent(evt)
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
}
|
}
|
||||||
w.mu.Unlock()
|
if err := scanner.Err(); err != nil {
|
||||||
}
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
if err := scanner.Err(); err != nil {
|
}
|
||||||
slog.Warn("kmsg watcher stopped", "err", err)
|
_ = f.Close()
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,7 +137,7 @@ func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
|||||||
if window == nil || len(window.events) == 0 {
|
if window == nil || len(window.events) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
go w.flushWindow(window)
|
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
@@ -229,7 +232,8 @@ func truncate(s string, max int) string {
|
|||||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||||
func isSATTarget(target string) bool {
|
func isSATTarget(target string) bool {
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia", "nvidia-stress", "memory", "memory-stress", "storage",
|
case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||||
"platform-stress":
|
"platform-stress":
|
||||||
return true
|
return true
|
||||||
|
|||||||
137
audit/internal/webui/layout.go
Normal file
137
audit/internal/webui/layout.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func layoutHead(title string) string {
|
||||||
|
return `<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||||
|
<title>` + html.EscapeString(title) + `</title>
|
||||||
|
<style>
|
||||||
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
|
a{color:var(--accent);text-decoration:none}
|
||||||
|
/* Sidebar */
|
||||||
|
.sidebar{width:210px;min-height:100vh;background:#1b1c1d;flex-shrink:0;display:flex;flex-direction:column}
|
||||||
|
.sidebar-logo{padding:18px 16px 12px;font-size:18px;font-weight:700;color:#fff;letter-spacing:-.5px}
|
||||||
|
.sidebar-logo span{color:rgba(255,255,255,.5);font-weight:400;font-size:12px;display:block;margin-top:2px}
|
||||||
|
.sidebar-version{padding:0 16px 14px;font-size:11px;color:rgba(255,255,255,.45)}
|
||||||
|
.sidebar-badge{margin:0 12px 12px;padding:5px 8px;border-radius:4px;font-size:11px;font-weight:600;text-align:center}
|
||||||
|
.sidebar-badge-warn{background:#7a4f00;color:#f6c90e}
|
||||||
|
.sidebar-badge-crit{background:#5c1a1a;color:#ff6b6b}
|
||||||
|
.nav{flex:1}
|
||||||
|
.nav-item{display:block;padding:10px 16px;color:rgba(255,255,255,.7);font-size:13px;border-left:3px solid transparent;transition:all .15s}
|
||||||
|
.nav-item:hover{color:#fff;background:rgba(255,255,255,.08)}
|
||||||
|
.nav-item.active{color:#fff;background:rgba(33,133,208,.25);border-left-color:var(--accent)}
|
||||||
|
/* Content */
|
||||||
|
.main{flex:1;display:flex;flex-direction:column;overflow:auto}
|
||||||
|
.topbar{padding:13px 24px;background:#1b1c1d;display:flex;align-items:center;gap:12px}
|
||||||
|
.topbar h1{font-size:16px;font-weight:700;color:rgba(255,255,255,.9)}
|
||||||
|
.content{padding:24px;flex:1}
|
||||||
|
/* Cards */
|
||||||
|
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);margin-bottom:16px;overflow:hidden}
|
||||||
|
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px;display:flex;align-items:center;gap:8px}
|
||||||
|
.card-head-actions{justify-content:space-between}
|
||||||
|
.card-head-buttons{display:flex;align-items:center;gap:8px;margin-left:auto;flex-wrap:wrap}
|
||||||
|
.card-body{padding:16px}
|
||||||
|
/* Buttons */
|
||||||
|
.btn{display:inline-flex;align-items:center;gap:6px;padding:8px 16px;border-radius:4px;font-size:13px;font-weight:700;cursor:pointer;border:none;transition:background .1s;font-family:inherit}
|
||||||
|
.btn-primary{background:var(--accent);color:#fff}.btn-primary:hover{background:var(--accent-dark)}
|
||||||
|
.btn-danger{background:#db2828;color:#fff}.btn-danger:hover{background:#b91c1c}
|
||||||
|
.btn-secondary{background:var(--surface-2);color:var(--ink);border:1px solid var(--border)}.btn-secondary:hover{background:#eee}
|
||||||
|
.btn-sm{padding:5px 10px;font-size:12px}
|
||||||
|
/* Tables */
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface)}
|
||||||
|
th{text-align:left;padding:9px 14px;color:var(--ink);font-weight:700;background:var(--surface-2);border-bottom:1px solid var(--border-lite)}
|
||||||
|
td{padding:9px 14px;border-top:1px solid var(--border-lite)}
|
||||||
|
tr:first-child td{border-top:0}
|
||||||
|
tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||||
|
/* Status badges */
|
||||||
|
.badge{display:inline-block;padding:2px 9px;border-radius:4px;font-size:11px;font-weight:700}
|
||||||
|
.badge-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.badge-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.badge-err{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.badge-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Component chips — one small square per device */
|
||||||
|
.chips{display:inline-flex;flex-wrap:wrap;gap:3px;align-items:center;vertical-align:middle}
|
||||||
|
.chip{display:inline-flex;align-items:center;justify-content:center;width:20px;height:20px;border-radius:3px;font-size:10px;font-weight:800;cursor:default;font-family:monospace;letter-spacing:0;user-select:none}
|
||||||
|
.chip-ok{background:var(--ok-bg);color:var(--ok-fg);border:1px solid #a3c293}
|
||||||
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Output terminal */
|
||||||
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
|
/* Forms */
|
||||||
|
.form-row{margin-bottom:14px}
|
||||||
|
.form-row label{display:block;font-size:12px;color:var(--muted);margin-bottom:5px;font-weight:700}
|
||||||
|
.form-row input,.form-row select{width:100%;padding:8px 10px;background:var(--surface);border:1px solid var(--border);border-radius:4px;color:var(--ink);font-size:13px;outline:none;font-family:inherit}
|
||||||
|
.form-row input:focus,.form-row select:focus{border-color:var(--accent);box-shadow:0 0 0 2px rgba(33,133,208,.2)}
|
||||||
|
/* Grid */
|
||||||
|
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:16px}
|
||||||
|
.grid3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:16px}
|
||||||
|
@media(max-width:900px){.grid2,.grid3{grid-template-columns:1fr}.card-head-actions{align-items:flex-start;flex-direction:column}.card-head-buttons{margin-left:0}}
|
||||||
|
/* iframe viewer */
|
||||||
|
.viewer-frame{width:100%;height:calc(100vh - 160px);border:0;border-radius:4px;background:var(--surface-2)}
|
||||||
|
/* Alerts */
|
||||||
|
.alert{padding:10px 14px;border-radius:4px;font-size:13px;margin-bottom:14px}
|
||||||
|
.alert-info{background:#dff0ff;border:1px solid #a9d4f5;color:#1e3a5f}
|
||||||
|
.alert-warn{background:var(--warn-bg);border:1px solid #c9ba9b;color:var(--warn-fg)}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
|
func layoutNav(active string, buildLabel string) string {
|
||||||
|
items := []struct{ id, label, href, onclick string }{
|
||||||
|
{"dashboard", "Dashboard", "/", ""},
|
||||||
|
{"audit", "Audit", "/audit", ""},
|
||||||
|
{"validate", "Validate", "/validate", ""},
|
||||||
|
{"burn", "Burn", "/burn", ""},
|
||||||
|
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||||
|
{"tasks", "Tasks", "/tasks", ""},
|
||||||
|
{"tools", "Tools", "/tools", ""},
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<aside class="sidebar">`)
|
||||||
|
b.WriteString(`<div class="sidebar-logo">bee<span>hardware audit</span></div>`)
|
||||||
|
if strings.TrimSpace(buildLabel) == "" {
|
||||||
|
buildLabel = "dev"
|
||||||
|
}
|
||||||
|
b.WriteString(`<div class="sidebar-version">Version ` + html.EscapeString(buildLabel) + `</div>`)
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
gspMode := strings.TrimSpace(string(raw))
|
||||||
|
switch gspMode {
|
||||||
|
case "gsp-off":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-warn">NVIDIA GSP=off</div>`)
|
||||||
|
case "gsp-stuck":
|
||||||
|
b.WriteString(`<div class="sidebar-badge sidebar-badge-crit">NVIDIA GSP stuck — reboot</div>`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`<nav class="nav">`)
|
||||||
|
for _, item := range items {
|
||||||
|
cls := "nav-item"
|
||||||
|
if item.id == active {
|
||||||
|
cls += " active"
|
||||||
|
}
|
||||||
|
if item.onclick != "" {
|
||||||
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||||
|
cls, item.href, item.onclick, item.label))
|
||||||
|
} else {
|
||||||
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||||
|
cls, item.href, item.label))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</nav>`)
|
||||||
|
b.WriteString(`</aside>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -21,6 +22,13 @@ type MetricsDB struct {
|
|||||||
db *sql.DB
|
db *sql.DB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *MetricsDB) Close() error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return m.db.Close()
|
||||||
|
}
|
||||||
|
|
||||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
@@ -45,6 +53,9 @@ CREATE TABLE IF NOT EXISTS sys_metrics (
|
|||||||
cpu_load_pct REAL,
|
cpu_load_pct REAL,
|
||||||
mem_load_pct REAL,
|
mem_load_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
power_source TEXT,
|
||||||
|
power_mode TEXT,
|
||||||
|
power_reason TEXT,
|
||||||
PRIMARY KEY (ts)
|
PRIMARY KEY (ts)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||||
@@ -54,6 +65,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
|
|||||||
usage_pct REAL,
|
usage_pct REAL,
|
||||||
mem_usage_pct REAL,
|
mem_usage_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
clock_mhz REAL,
|
||||||
|
mem_clock_mhz REAL,
|
||||||
PRIMARY KEY (ts, gpu_index)
|
PRIMARY KEY (ts, gpu_index)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||||
@@ -70,6 +83,47 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
PRIMARY KEY (ts, name)
|
PRIMARY KEY (ts, name)
|
||||||
);
|
);
|
||||||
`)
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_source", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "sys_metrics", "power_mode", "TEXT"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "sys_metrics", "power_reason", "TEXT")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureMetricsColumn adds column (with the given SQL type definition) to
// table unless a column of that name already exists, migrating databases
// created before the column was introduced. Column-name comparison is
// case-insensitive to match SQLite's identifier semantics. table and column
// come from compile-time constants, so concatenating them into the PRAGMA
// and DDL statements (which cannot take bind parameters) is safe here.
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
	info, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		return err
	}
	defer info.Close()

	for info.Next() {
		// PRAGMA table_info yields: cid, name, type, notnull, dflt_value, pk.
		var (
			id, notNull, isPK int
			colName, colType  string
			defaultVal        sql.NullString
		)
		if err := info.Scan(&id, &colName, &colType, &notNull, &defaultVal, &isPK); err != nil {
			return err
		}
		if strings.EqualFold(colName, column) {
			// Column already present — nothing to migrate.
			return nil
		}
	}
	if err := info.Err(); err != nil {
		return err
	}
	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
	return err
}
|
||||||
|
|
||||||
@@ -83,16 +137,16 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
defer func() { _ = tx.Rollback() }()
|
defer func() { _ = tx.Rollback() }()
|
||||||
|
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w) VALUES(?,?,?,?)`,
|
`INSERT OR REPLACE INTO sys_metrics(ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason) VALUES(?,?,?,?,?,?,?)`,
|
||||||
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW,
|
ts, s.CPULoadPct, s.MemLoadPct, s.PowerW, s.PowerSource, s.PowerMode, s.PowerReason,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, g := range s.GPUs {
|
for _, g := range s.GPUs {
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -119,14 +173,81 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
return tx.Commit()
|
return tx.Commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||||
|
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||||
|
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||||
|
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||||
|
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||||
|
// the overall shape of every chart.
|
||||||
|
//
|
||||||
|
// Called hourly by the metrics collector background goroutine.
|
||||||
|
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
start := deleteOlderThan.Unix()
|
||||||
|
end := downsampleBefore.Unix()
|
||||||
|
if end <= start {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||||
|
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
_, err := m.db.Exec(`
|
||||||
|
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||||
|
AND ts NOT IN (
|
||||||
|
SELECT MIN(ts) FROM `+table+`
|
||||||
|
WHERE ts >= ? AND ts < ?
|
||||||
|
GROUP BY ts / 60
|
||||||
|
)`, start, end, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||||
|
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||||
|
func (m *MetricsDB) Prune(before time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cutTS := before.Unix()
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w,power_source,power_mode,power_reason FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
|
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
if m == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
return m.loadSamples(
|
||||||
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w,IFNULL(power_source,''),IFNULL(power_mode,''),IFNULL(power_reason,'') FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
|
start.Unix(), end.Unix(),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
@@ -140,11 +261,14 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
type sysRow struct {
|
type sysRow struct {
|
||||||
ts int64
|
ts int64
|
||||||
cpu, mem, pwr float64
|
cpu, mem, pwr float64
|
||||||
|
powerSource string
|
||||||
|
powerMode string
|
||||||
|
powerReason string
|
||||||
}
|
}
|
||||||
var sysRows []sysRow
|
var sysRows []sysRow
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var r sysRow
|
var r sysRow
|
||||||
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr); err != nil {
|
if err := rows.Scan(&r.ts, &r.cpu, &r.mem, &r.pwr, &r.powerSource, &r.powerMode, &r.powerReason); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
sysRows = append(sysRows, r)
|
sysRows = append(sysRows, r)
|
||||||
@@ -163,7 +287,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
}
|
}
|
||||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
gRows, err := m.db.Query(
|
gRows, err := m.db.Query(
|
||||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
minTS, maxTS,
|
minTS, maxTS,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -171,7 +295,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
for gRows.Next() {
|
for gRows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var g platform.GPUMetricRow
|
var g platform.GPUMetricRow
|
||||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -254,10 +378,13 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||||
for i, r := range sysRows {
|
for i, r := range sysRows {
|
||||||
s := platform.LiveMetricSample{
|
s := platform.LiveMetricSample{
|
||||||
Timestamp: time.Unix(r.ts, 0).UTC(),
|
Timestamp: time.Unix(r.ts, 0).UTC(),
|
||||||
CPULoadPct: r.cpu,
|
CPULoadPct: r.cpu,
|
||||||
MemLoadPct: r.mem,
|
MemLoadPct: r.mem,
|
||||||
PowerW: r.pwr,
|
PowerW: r.pwr,
|
||||||
|
PowerSource: r.powerSource,
|
||||||
|
PowerMode: r.powerMode,
|
||||||
|
PowerReason: r.powerReason,
|
||||||
}
|
}
|
||||||
for _, idx := range gpuIndices {
|
for _, idx := range gpuIndices {
|
||||||
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
if g, ok := gpuData[gpuKey{r.ts, idx}]; ok {
|
||||||
@@ -283,7 +410,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
rows, err := m.db.Query(`
|
rows, err := m.db.Query(`
|
||||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||||
|
g.clock_mhz, g.mem_clock_mhz
|
||||||
FROM sys_metrics s
|
FROM sys_metrics s
|
||||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
ORDER BY s.ts, g.gpu_index
|
ORDER BY s.ts, g.gpu_index
|
||||||
@@ -294,13 +422,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
defer rows.Close()
|
defer rows.Close()
|
||||||
|
|
||||||
cw := csv.NewWriter(w)
|
cw := csv.NewWriter(w)
|
||||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var cpu, mem, pwr float64
|
var cpu, mem, pwr float64
|
||||||
var gpuIdx sql.NullInt64
|
var gpuIdx sql.NullInt64
|
||||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
row := []string{
|
row := []string{
|
||||||
@@ -316,9 +444,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
row = append(row, "", "", "", "", "")
|
row = append(row, "", "", "", "", "", "", "")
|
||||||
}
|
}
|
||||||
_ = cw.Write(row)
|
_ = cw.Write(row)
|
||||||
}
|
}
|
||||||
@@ -326,9 +456,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
return cw.Error()
|
return cw.Error()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close closes the database.
|
|
||||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
|
||||||
|
|
||||||
// nullFloat wraps v in a sql.NullFloat64 that is always marked valid, for
// inserting into nullable REAL columns.
func nullFloat(v float64) sql.NullFloat64 {
	out := sql.NullFloat64{Valid: true}
	out.Float64 = v
	return out
}
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||||
|
raw, err := sql.Open("sqlite", path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("sql.Open: %v", err)
|
||||||
|
}
|
||||||
|
_, err = raw.Exec(`
|
||||||
|
CREATE TABLE gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create legacy schema: %v", err)
|
||||||
|
}
|
||||||
|
_ = raw.Close()
|
||||||
|
|
||||||
|
db, err := openMetricsDB(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
now := time.Unix(1_700_000_100, 0).UTC()
|
||||||
|
err = db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: now,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
samples, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||||
|
t.Fatalf("samples=%+v", samples)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||||
|
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||||
|
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||||
|
CPULoadPct: float64(i),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadBetween: %v", err)
|
||||||
|
}
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||||
|
}
|
||||||
|
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||||
|
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
613
audit/internal/webui/page_benchmark.go
Normal file
613
audit/internal/webui/page_benchmark.go
Normal file
@@ -0,0 +1,613 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// benchmarkHistoryRun is one prior benchmark run as summarized for the
// benchmark page (appears to back the results/history card — confirm
// against renderBenchmarkResultsCard).
type benchmarkHistoryRun struct {
	generatedAt   time.Time       // when the run's report was generated
	displayTime   string          // generatedAt preformatted for display
	gpuScores     map[int]float64 // per-GPU score — presumably keyed by GPU index; verify against producer
	gpuStatuses   map[int]string  // per-GPU status string — same presumed key
	overallStatus string          // aggregate status across the whole run
}
|
||||||
|
|
||||||
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="grid2">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Benchmark Setup</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="form-row">
|
||||||
|
<label>Profile</label>
|
||||||
|
<select id="benchmark-profile">
|
||||||
|
<option value="standard" selected>Standard — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</option>
|
||||||
|
<option value="stability">Stability — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</option>
|
||||||
|
<option value="overnight">Overnight — Perf ` + validateFmtDur(platform.BenchmarkEstimatedPerfOvernightSec) + ` / Power Fit ` + validateFmtDur(platform.BenchmarkEstimatedPowerOvernightSec) + `</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-row">
|
||||||
|
<label>GPU Selection</label>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="benchmarkSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="benchmark-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<label class="benchmark-cb-row">
|
||||||
|
<input type="radio" name="benchmark-mode" value="sequential" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Sequential — one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-parallel-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="parallel" onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="benchmark-cb-row" id="benchmark-ramp-label">
|
||||||
|
<input type="radio" name="benchmark-mode" value="ramp-up" checked onchange="benchmarkUpdateSelectionNote()">
|
||||||
|
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
|
||||||
|
</label>
|
||||||
|
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
|
||||||
|
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>▶ Run Performance Benchmark</button>
|
||||||
|
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>▶ Run Power / Thermal Fit</button>
|
||||||
|
<button id="benchmark-run-autotune-btn" class="btn btn-secondary" onclick="runBenchmarkAutotune()">Autotune</button>
|
||||||
|
</div>
|
||||||
|
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
|
||||||
|
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
|
||||||
|
<div id="benchmark-autotune-status" style="margin-top:10px;font-size:12px;color:var(--muted)">Autotune status: loading…</div>
|
||||||
|
<div style="margin-top:6px;font-size:12px;color:var(--muted)">Autotune overwrites the saved system-power source and applies it to all new power charts and tests.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Method Split</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
|
||||||
|
<table>
|
||||||
|
<tr><th>Run Type</th><th>Engine</th><th>Question</th><th>Standard</th><th>Stability</th></tr>
|
||||||
|
<tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPerfStabilitySec) + `</td></tr>
|
||||||
|
<tr><td>Power / Thermal Fit</td><td><code>dcgmproftester</code> + <code>nvidia-smi -pl</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStandardSec) + `</td><td>` + validateFmtDur(platform.BenchmarkEstimatedPowerStabilitySec) + `</td></tr>
|
||||||
|
</table>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin-top:10px">Timings are per full ramp-up run (1 GPU → all selected), measured on 4–8 GPU servers. Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + `<div id="benchmark-results-section">` + renderBenchmarkResultsCard(opts.ExportDir) + `</div>` + `
|
||||||
|
|
||||||
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
|
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.benchmark-cb-row { display:flex; align-items:flex-start; gap:8px; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.benchmark-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.benchmark-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let benchmarkES = null;
|
||||||
|
function benchmarkTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
function benchmarkSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||||
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
|
.map(function(el) { return parseInt(el.value, 10); })
|
||||||
|
.filter(function(v) { return !Number.isNaN(v); })
|
||||||
|
.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
function benchmarkMode() {
|
||||||
|
const el = document.querySelector('input[name="benchmark-mode"]:checked');
|
||||||
|
return el ? el.value : 'sequential';
|
||||||
|
}
|
||||||
|
function benchmarkUpdateSelectionNote() {
|
||||||
|
const selected = benchmarkSelectedGPUIndices();
|
||||||
|
const perfBtn = document.getElementById('benchmark-run-performance-btn');
|
||||||
|
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
|
||||||
|
const note = document.getElementById('benchmark-selection-note');
|
||||||
|
if (!selected.length) {
|
||||||
|
perfBtn.disabled = true;
|
||||||
|
fitBtn.disabled = true;
|
||||||
|
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
perfBtn.disabled = false;
|
||||||
|
fitBtn.disabled = false;
|
||||||
|
const mode = benchmarkMode();
|
||||||
|
if (mode === 'ramp-up') {
|
||||||
|
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses dcgmproftester load with nvidia-smi power-limit search per step.';
|
||||||
|
} else if (mode === 'parallel') {
|
||||||
|
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
|
||||||
|
} else {
|
||||||
|
note.textContent = 'Sequential: each selected GPU benchmarked separately.';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function benchmarkRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('benchmark-gpu-list');
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(function(gpu) {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="benchmark-gpu-row">'
|
||||||
|
+ '<input class="benchmark-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="benchmarkUpdateSelectionNote()">'
|
||||||
|
+ '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
|
||||||
|
+ '</label>';
|
||||||
|
}).join('');
|
||||||
|
benchmarkApplyMultiGPUState(gpus.length);
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkApplyMultiGPUState(gpuCount) {
|
||||||
|
var multiValues = ['parallel', 'ramp-up'];
|
||||||
|
var radios = document.querySelectorAll('input[name="benchmark-mode"]');
|
||||||
|
radios.forEach(function(el) {
|
||||||
|
var isMulti = multiValues.indexOf(el.value) >= 0;
|
||||||
|
if (gpuCount < 2 && isMulti) {
|
||||||
|
el.disabled = true;
|
||||||
|
if (el.checked) {
|
||||||
|
var seq = document.querySelector('input[name="benchmark-mode"][value="sequential"]');
|
||||||
|
if (seq) seq.checked = true;
|
||||||
|
}
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '0.4';
|
||||||
|
} else {
|
||||||
|
el.disabled = false;
|
||||||
|
if (gpuCount >= 2 && el.value === 'ramp-up') el.checked = true;
|
||||||
|
var label = el.closest('label');
|
||||||
|
if (label) label.style.opacity = '';
|
||||||
|
}
|
||||||
|
});
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function benchmarkLoadGPUs() {
|
||||||
|
const status = document.getElementById('benchmark-run-status');
|
||||||
|
status.textContent = '';
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
benchmarkRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('benchmark-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function benchmarkSelectAll() {
|
||||||
|
document.querySelectorAll('.benchmark-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
benchmarkUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
// Uncheck every benchmark GPU checkbox and refresh the selection summary.
function benchmarkSelectNone() {
  const boxes = document.querySelectorAll('.benchmark-gpu-checkbox');
  boxes.forEach(function(box) { box.checked = false; });
  benchmarkUpdateSelectionNote();
}
|
||||||
|
// Enqueue an NVIDIA benchmark run ('performance' or 'power-fit') for the
// currently selected GPUs and stream each queued task's log into the
// on-page terminal via Server-Sent Events, one task at a time.
function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
if (!selected.length) {
status.textContent = 'Select at least one GPU.';
return;
}
// Close any stream left over from a previous run before starting a new one.
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode();
// Ramp-up only makes sense with more than one GPU selected.
const rampUp = mode === 'ramp-up' && selected.length > 1;
// Parallel execution is only offered for the performance benchmark.
const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
// Request payload; display_name is echoed back into the page header.
const body = {
profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected,
run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs,
ramp_up: rampUp,
display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
};
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...';
const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify(body)
}).then(function(r) {
// Parse the body first so server-supplied error messages are surfaced.
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No benchmark task was queued.');
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
// Recursively stream each task's log in order; `failures` accumulates
// the count of tasks that ended with an error payload or a dropped stream.
const streamNext = function(idx, failures) {
if (idx >= taskIds.length) {
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
return;
}
const taskId = taskIds[idx];
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
// Default messages are raw log lines; keep the terminal scrolled to the bottom.
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// 'done' carries an error string in e.data on failure, empty on success.
benchmarkES.addEventListener('done', function(e) {
benchmarkES.close();
benchmarkES = null;
if (e.data) failures += 1;
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
term.scrollTop = term.scrollHeight;
const isLast = (idx + 1 >= taskIds.length);
streamNext(idx + 1, failures);
// Refresh the saved-results card only once, after the final task.
if (isLast) { benchmarkRefreshResults(); }
});
// A transport error counts as one failure and moves on to the next task.
benchmarkES.onerror = function() {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += '\nERROR: stream disconnected.\n';
term.scrollTop = term.scrollHeight;
streamNext(idx + 1, failures + 1);
};
};
streamNext(0, 0);
}).catch(function(err) {
status.textContent = 'Error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
|
||||||
|
// Render the autotune status summary line into #benchmark-autotune-status.
// Shows a "not configured" fallback message when the payload has no config.
function benchmarkRenderAutotuneStatus(payload) {
  const el = document.getElementById('benchmark-autotune-status');
  if (!el) return;
  const configured = payload && payload.configured && payload.config;
  if (!configured) {
    el.textContent = 'Autotune status: not configured. Temporary fallback source is used until autotune completes.';
    return;
  }
  const cfg = payload.config || {};
  const decision = payload.decision || {};
  let updated = 'unknown time';
  if (cfg.updated_at) updated = new Date(cfg.updated_at).toLocaleString();
  let confidence = '';
  if (typeof cfg.confidence === 'number') confidence = ' · confidence ' + Math.round(cfg.confidence * 100) + '%';
  let effective = '';
  if (decision.effective_source) effective = ' · effective ' + decision.effective_source;
  let mode = '';
  if (decision.mode) mode = ' · mode ' + decision.mode;
  el.textContent = 'Autotune status: ' + cfg.selected_source + effective + mode + ' · updated ' + updated + confidence;
}
|
||||||
|
// Fetch the autotune status from the API and render it; on failure the
// status element shows the error message inline.
function loadBenchmarkAutotuneStatus() {
  fetch('/api/bee-bench/nvidia/autotune/status')
    .then(function(r) {
      return r.json().then(function(body) {
        if (r.ok) return body;
        throw new Error(body.error || ('HTTP ' + r.status));
      });
    })
    .then(function(body) {
      benchmarkRenderAutotuneStatus(body);
    })
    .catch(function(err) {
      const el = document.getElementById('benchmark-autotune-status');
      if (el) el.textContent = 'Autotune status error: ' + err.message;
    });
}
|
||||||
|
// Enqueue a benchmark autotune run for the selected GPUs and stream its
// task log into the on-page terminal via Server-Sent Events. On completion
// the autotune status line is refreshed.
function runBenchmarkAutotune() {
const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status');
const term = document.getElementById('benchmark-terminal');
// Close any stream left over from a previous run before starting a new one.
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— NVIDIA Benchmark Autotune';
term.textContent = 'Enqueuing benchmark autotune...\n';
status.textContent = 'Queueing autotune...';
fetch('/api/bee-bench/nvidia/autotune/run', {
method: 'POST',
headers: {'Content-Type':'application/json'},
body: JSON.stringify({
profile: document.getElementById('benchmark-profile').value || 'standard',
// Parallel mode autotunes against the performance benchmark;
// sequential/ramp-up autotune against the power fit.
benchmark_kind: benchmarkMode() === 'parallel' ? 'performance' : 'power-fit',
gpu_indices: selected
})
}).then(function(r) {
// Parse the body first so server-supplied error messages are surfaced.
return r.json().then(function(payload) {
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
return payload;
});
}).then(function(d) {
const taskIds = benchmarkTaskIDs(d);
if (!taskIds.length) throw new Error('No autotune task was queued.');
// Autotune enqueues a single task; only the first ID is streamed.
const taskId = taskIds[0];
status.textContent = 'Autotune queued: ' + taskId;
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
// 'done' carries an error string in e.data on failure, empty on success.
benchmarkES.addEventListener('done', function(e) {
if (benchmarkES) {
benchmarkES.close();
benchmarkES = null;
}
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
status.textContent = e.data ? 'Autotune failed.' : 'Autotune completed.';
loadBenchmarkAutotuneStatus();
});
}).catch(function(err) {
status.textContent = 'Autotune error.';
term.textContent += 'ERROR: ' + err.message + '\n';
});
}
|
||||||
|
// Initial page load: populate the GPU picker and the autotune status line.
benchmarkLoadGPUs();
loadBenchmarkAutotuneStatus();
|
||||||
|
// Reload the saved-results section HTML after a run completes.
// Failures are deliberately ignored: this is a best-effort refresh.
function benchmarkRefreshResults() {
  fetch('/api/benchmark/results')
    .then(function(r) { return r.text(); })
    .then(function(html) {
      const section = document.getElementById('benchmark-results-section');
      if (!section) return;
      section.innerHTML = html;
    })
    .catch(function() {});
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
|
maxIdx, runs := loadBenchmarkHistory(exportDir)
|
||||||
|
perf := renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved performance benchmark runs yet.",
|
||||||
|
maxIdx,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
power := renderPowerBenchmarkResultsCard(exportDir)
|
||||||
|
return perf + "\n" + power
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderBenchmarkResultsCardFromRuns renders a results card as an HTML table
// with one row per saved run and one score column per GPU index from 0 to
// maxGPUIndex. When runs is empty, a card containing emptyMessage is
// returned instead. All caller-supplied text is HTML-escaped.
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
if len(runs) == 0 {
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
if strings.TrimSpace(description) != "" {
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
}
b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
// One column per possible GPU index across all runs.
for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
}
b.WriteString(`</tr></thead><tbody>`)
for i, run := range runs {
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
// Empty overall status is treated as OK; FAILED is critical, anything
// else non-OK is rendered as a warning.
overallColor := "var(--ok)"
overallLabel := run.overallStatus
if overallLabel == "" {
overallLabel = "OK"
}
if overallLabel == "FAILED" {
overallColor = "var(--crit-fg,#9f3a38)"
} else if overallLabel != "OK" {
overallColor = "var(--warn)"
}
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx]
if !ok {
// This run has no score for this GPU index.
b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue
}
gpuStatus := run.gpuStatuses[idx]
// Per-GPU score styling mirrors the overall-status coloring; unknown
// statuses fall through to the warning style.
scoreColor := ""
switch gpuStatus {
case "FAILED":
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
case "WARNING", "PARTIAL":
scoreColor = ` style="color:var(--warn);font-weight:600"`
case "", "OK":
default:
scoreColor = ` style="color:var(--warn);font-weight:600"`
}
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
}
b.WriteString(`</tr>`)
}
b.WriteString(`</tbody></table></div></div></div>`)
return b.String()
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
|
||||||
|
baseDir := app.DefaultBeeBenchPerfDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "perf")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return -1, nil
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
|
||||||
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
|
maxGPUIndex := -1
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var result platform.NvidiaBenchmarkResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
run := benchmarkHistoryRun{
|
||||||
|
generatedAt: result.GeneratedAt,
|
||||||
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
|
if gpu.Index > maxGPUIndex {
|
||||||
|
maxGPUIndex = gpu.Index
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runs = append(runs, run)
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
return maxGPUIndex, runs
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||||
|
baseDir := app.DefaultBeeBenchPowerDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-bench", "power")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "power-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return `<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved power benchmark runs yet.</p></div></div>`
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
|
||||||
|
type powerRun struct {
|
||||||
|
generatedAt time.Time
|
||||||
|
displayTime string
|
||||||
|
result platform.NvidiaPowerBenchResult
|
||||||
|
}
|
||||||
|
var runs []powerRun
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var r platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &r); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
runs = append(runs, powerRun{
|
||||||
|
generatedAt: r.GeneratedAt,
|
||||||
|
displayTime: r.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
result: r,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card" style="margin-top:16px"><div class="card-head">Power / Thermal Fit Results</div><div class="card-body">`)
|
||||||
|
|
||||||
|
latest := runs[0].result
|
||||||
|
b.WriteString(`<p style="font-size:12px;color:var(--muted);margin-bottom:10px">Latest run: ` + html.EscapeString(runs[0].displayTime))
|
||||||
|
if latest.Hostname != "" {
|
||||||
|
b.WriteString(` — ` + html.EscapeString(latest.Hostname))
|
||||||
|
}
|
||||||
|
if latest.OverallStatus != "" {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if latest.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(` — <span style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(latest.OverallStatus) + `</span>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</p>`)
|
||||||
|
|
||||||
|
if len(latest.GPUs) > 0 {
|
||||||
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for _, gpu := range latest.GPUs {
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
|
rowStyle := ""
|
||||||
|
finalStyle := ""
|
||||||
|
if derated {
|
||||||
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
|
}
|
||||||
|
statusLabel := gpu.Status
|
||||||
|
if statusLabel == "" {
|
||||||
|
statusLabel = "OK"
|
||||||
|
}
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
nominalStr := "-"
|
||||||
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
|
}
|
||||||
|
singleStr := "-"
|
||||||
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
|
}
|
||||||
|
p95Str := "-"
|
||||||
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
|
p95Str = fmt.Sprintf("%.0f", gpu.MaxObservedPowerW)
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr` + rowStyle + `>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(runs) > 1 {
|
||||||
|
b.WriteString(`<details style="margin-top:12px"><summary style="font-size:12px;color:var(--muted);cursor:pointer">` + strconv.Itoa(len(runs)) + ` runs total</summary>`)
|
||||||
|
b.WriteString(`<div style="overflow-x:auto;margin-top:8px"><table><thead><tr><th>#</th><th>Time</th><th>GPUs</th><th>Status</th></tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
statusColor := "var(--ok)"
|
||||||
|
if run.result.OverallStatus != "OK" {
|
||||||
|
statusColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + strconv.Itoa(len(run.result.GPUs)) + `</td>`)
|
||||||
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(run.result.OverallStatus) + `</td>`)
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></details>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
383
audit/internal/webui/page_burn.go
Normal file
383
audit/internal/webui/page_burn.go
Normal file
@@ -0,0 +1,383 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderBurn() string {
|
||||||
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Burn Profile</div>
|
||||||
|
<div class="card-body burn-profile-body">
|
||||||
|
<div class="burn-profile-col">
|
||||||
|
<div class="form-row" style="margin:0 0 8px"><label>Preset</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="smoke" checked><span>Smoke — 5 min/GPU (sequential) or 5 min (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="acceptance"><span>Acceptance — 1 h/GPU (sequential) or 1 h (parallel)</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 h/GPU (sequential) or 8 h (parallel)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
|
<p>Runs checked tests as separate sequential tasks. In sequential GPU mode, total time = profile duration × N GPU. In parallel mode, all selected GPUs burn simultaneously for one profile duration.</p>
|
||||||
|
</div>
|
||||||
|
<div class="burn-profile-col burn-profile-action">
|
||||||
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
|
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||||
|
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA recipes and custom NVIDIA stressors use only the GPUs selected here. Multi-GPU interconnect tests are limited to this selection as well.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:4px;margin-top:10px">
|
||||||
|
<label class="cb-row">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="sequential" checked>
|
||||||
|
<span>Sequential — selected GPUs one at a time</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-parallel-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="parallel">
|
||||||
|
<span>Parallel — all selected GPUs simultaneously</span>
|
||||||
|
</label>
|
||||||
|
<label class="cb-row" id="burn-ramp-label">
|
||||||
|
<input type="radio" name="burn-nvidia-mode" value="ramp-up">
|
||||||
|
<span>Ramp-up — add one GPU at a time</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="burn-section">Core Burn Paths</div>
|
||||||
|
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>GPU Max Load</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Combine vendor-backed and custom GPU max-load recipes in one run set. ` + "dcgmproftester" + ` is the primary official NVIDIA path; custom stressors remain available as parallel checkbox options.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-nvidia-compute" checked disabled><span>NVIDIA Max Compute Load (dcgmproftester) <span class="cb-note" id="note-nvidia-compute"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" checked disabled><span>GPU Burn (bee-gpu-burn) <span class="cb-note" id="note-bee"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" disabled><span>John GPU Stress (john/OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" disabled><span>AMD GPU Stress (rvs gst) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card burn-card">
|
||||||
|
<div class="card-head card-head-actions"><span>Compute Stress</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'}])">Run</button></div>
|
||||||
|
<div class="card-body burn-card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||||
|
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||||
|
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||||
|
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||||
|
.burn-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.burn-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.burn-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.burn-profile-col { min-width:0; }
|
||||||
|
.burn-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:flex-start; gap:8px; }
|
||||||
|
.burn-profile-action p { font-size:12px; color:var(--muted); margin:0; width:100%; text-align:left; }
|
||||||
|
.burn-section { font-size:12px; font-weight:700; letter-spacing:.06em; text-transform:uppercase; color:var(--muted); margin:0 0 10px; padding-top:4px; }
|
||||||
|
.burn-grid { align-items:stretch; }
|
||||||
|
.burn-card { height:100%; display:flex; flex-direction:column; }
|
||||||
|
.burn-card-body { flex:1; display:flex; flex-direction:column; }
|
||||||
|
.card-head-actions { justify-content:space-between; }
|
||||||
|
.card-head-buttons { display:flex; align-items:center; gap:8px; margin-left:auto; }
|
||||||
|
@media(max-width:900px){ .card-head-actions { align-items:flex-start; flex-direction:column; } .card-head-buttons { margin-left:0; } .burn-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let biES = null;
|
||||||
|
// Normalize an enqueue-API response into an array of task IDs.
// Accepts {task_ids: [...]} (non-empty) or {task_id: ...}; anything else
// yields an empty array.
function burnTaskIDs(payload) {
  if (!payload) return [];
  const ids = payload.task_ids;
  if (Array.isArray(ids) && ids.length > 0) return ids;
  return payload.task_id ? [payload.task_id] : [];
}
|
||||||
|
// Currently selected burn profile preset; defaults to 'smoke' when no
// radio button is checked.
function burnProfile() {
  const checked = document.querySelector('input[name="burn-profile"]:checked');
  if (checked) return checked.value;
  return 'smoke';
}
|
||||||
|
// Indices of the checked, enabled NVIDIA GPU checkboxes, sorted ascending.
// Non-numeric checkbox values are dropped.
function burnSelectedGPUIndices() {
  const boxes = Array.from(document.querySelectorAll('.burn-gpu-checkbox'));
  const indices = [];
  boxes.forEach(function(box) {
    if (!box.checked || box.disabled) return;
    const idx = parseInt(box.value, 10);
    if (!Number.isNaN(idx)) indices.push(idx);
  });
  indices.sort(function(a, b) { return a - b; });
  return indices;
}
|
||||||
|
// Currently selected NVIDIA burn execution mode; defaults to 'sequential'.
function burnNvidiaMode() {
  const checked = document.querySelector('input[name="burn-nvidia-mode"]:checked');
  if (checked) return checked.value;
  return 'sequential';
}
|
||||||
|
// Enable or disable the multi-GPU burn modes based on detected GPU count.
// With fewer than 2 GPUs, 'parallel' and 'ramp-up' are disabled and dimmed;
// if one of them was checked, selection falls back to 'sequential'.
function burnApplyMultiGPUState(gpuCount) {
  var multiValues = ['parallel', 'ramp-up'];
  var radios = document.querySelectorAll('input[name="burn-nvidia-mode"]');
  radios.forEach(function(el) {
    var isMulti = multiValues.indexOf(el.value) >= 0;
    // Hoisted: the original declared `var label` separately in both
    // branches, redeclaring the same function-scoped variable.
    var label = el.closest('label');
    if (gpuCount < 2 && isMulti) {
      el.disabled = true;
      if (el.checked) {
        // The checked mode is no longer available; fall back to sequential.
        var seq = document.querySelector('input[name="burn-nvidia-mode"][value="sequential"]');
        if (seq) seq.checked = true;
      }
      if (label) label.style.opacity = '0.4';
    } else {
      el.disabled = false;
      if (label) label.style.opacity = '';
    }
  });
}
|
||||||
|
// Refresh the helper text under the burn GPU list to reflect the current
// selection (or prompt the user to select at least one GPU).
function burnUpdateSelectionNote() {
  const note = document.getElementById('burn-selection-note');
  const selected = burnSelectedGPUIndices();
  if (selected.length === 0) {
    note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA burn recipes.';
  } else {
    note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '. Official and custom NVIDIA tasks will use only these GPUs.';
  }
}
|
||||||
|
// Render one pre-checked checkbox row per detected GPU into #burn-gpu-list,
// then sync the mode radios and selection note. Shows a placeholder when
// no GPUs were detected.
function burnRenderGPUList(gpus) {
  const root = document.getElementById('burn-gpu-list');
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    burnUpdateSelectionNote();
    return;
  }
  const rows = gpus.map(function(gpu) {
    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
    const parts = [
      '<label class="burn-gpu-row">',
      '<input class="burn-gpu-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="burnUpdateSelectionNote()">',
      '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>',
      '</label>'
    ];
    return parts.join('');
  });
  root.innerHTML = rows.join('');
  burnApplyMultiGPUState(gpus.length);
  burnUpdateSelectionNote();
}
|
||||||
|
function burnSelectAll() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = true; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnSelectNone() {
|
||||||
|
document.querySelectorAll('.burn-gpu-checkbox').forEach(function(el) { el.checked = false; });
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
}
|
||||||
|
function burnLoadGPUs() {
|
||||||
|
fetch('/api/gpu/nvidia').then(function(r) {
|
||||||
|
return r.json().then(function(body) {
|
||||||
|
if (!r.ok) throw new Error(body.error || ('HTTP ' + r.status));
|
||||||
|
return body;
|
||||||
|
});
|
||||||
|
}).then(function(gpus) {
|
||||||
|
burnRenderGPUList(gpus);
|
||||||
|
}).catch(function(err) {
|
||||||
|
document.getElementById('burn-gpu-list').innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
burnUpdateSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||||
|
const body = Object.assign({ profile: burnProfile(), display_name: label }, extra || {});
|
||||||
|
if (useSelectedNvidia) {
|
||||||
|
const selected = burnSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
body.gpu_indices = selected;
|
||||||
|
const bMode = burnNvidiaMode();
|
||||||
|
if (bMode === 'ramp-up' && selected.length > 1) {
|
||||||
|
body.stagger_gpu_start = true;
|
||||||
|
} else if (bMode === 'parallel' && selected.length > 1) {
|
||||||
|
body.parallel_gpus = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fetch('/api/sat/' + target + '/run', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(body)
|
||||||
|
}).then(function(r) {
|
||||||
|
return r.json().then(function(payload) {
|
||||||
|
if (!r.ok) throw new Error(payload.error || ('HTTP ' + r.status));
|
||||||
|
return payload;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamTask(taskId, label) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||||
|
}
|
||||||
|
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||||
|
if (biES) { biES.close(); biES = null; }
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||||
|
term.textContent += 'ERROR: no tasks queued.\n';
|
||||||
|
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||||
|
}
|
||||||
|
const streamNext = function(idx, failures) {
|
||||||
|
if (idx >= taskIds.length) {
|
||||||
|
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||||
|
}
|
||||||
|
const taskId = taskIds[idx];
|
||||||
|
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + (e.data ? 1 : 0));
|
||||||
|
});
|
||||||
|
biES.onerror = function() {
|
||||||
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + 1);
|
||||||
|
};
|
||||||
|
}).then(function(nextFailures) {
|
||||||
|
return streamNext(idx + 1, nextFailures);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return streamNext(0, 0);
|
||||||
|
}
|
||||||
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
|
const enabled = tasks.filter(function(t) {
|
||||||
|
const el = document.getElementById(t.id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
const status = statusElId ? document.getElementById(statusElId) : null;
|
||||||
|
if (status) status.textContent = '';
|
||||||
|
if (!enabled.length) {
|
||||||
|
if (status) status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const term = document.getElementById('bi-terminal');
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
document.getElementById('bi-title').textContent = '— Burn one by one [' + burnProfile() + ']';
|
||||||
|
term.textContent = '';
|
||||||
|
const runNext = function(idx) {
|
||||||
|
if (idx >= enabled.length) {
|
||||||
|
if (status) status.textContent = 'Completed ' + enabled.length + ' task(s).';
|
||||||
|
return Promise.resolve();
|
||||||
|
}
|
||||||
|
const t = enabled[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + enabled.length + '] ' + t.label + '\n';
|
||||||
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
|
.then(function(d) {
|
||||||
|
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||||
|
})
|
||||||
|
.then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
})
|
||||||
|
.catch(function(err) {
|
||||||
|
if (status) status.textContent = 'Error: ' + err.message;
|
||||||
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
|
return Promise.reject(err);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runPlatformStress() {
|
||||||
|
const comps = [];
|
||||||
|
const computeIDs = ['burn-cpu', 'burn-mem-stress', 'burn-sat-stress'];
|
||||||
|
const gpuIDs = ['burn-nvidia-compute', 'burn-gpu-bee', 'burn-gpu-john', 'burn-gpu-rvs'];
|
||||||
|
const hasChecked = function(ids) {
|
||||||
|
return ids.some(function(id) {
|
||||||
|
const el = document.getElementById(id);
|
||||||
|
return el && el.checked && !el.disabled;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
if (hasChecked(computeIDs)) comps.push('cpu');
|
||||||
|
if (hasChecked(gpuIDs)) comps.push('gpu');
|
||||||
|
if (!comps.length) {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
if (status) status.textContent = 'Select at least one test in GPU Max Load or Compute Stress.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||||
|
enqueueBurnTask('platform-stress', 'Platform Thermal Cycling', extra, false).then(function(d) {
|
||||||
|
streamTask(d.task_id, 'Platform Thermal Cycling');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runAllBurnTasks() {
|
||||||
|
const status = document.getElementById('burn-all-status');
|
||||||
|
const all = [
|
||||||
|
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||||
|
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||||
|
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||||
|
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||||
|
{id:'burn-cpu',target:'cpu',label:'CPU Burn-in'},
|
||||||
|
{id:'burn-mem-stress',target:'memory-stress',label:'Memory Burn-in'},
|
||||||
|
{id:'burn-sat-stress',target:'sat-stress',label:'SAT Stress (stressapptest)'},
|
||||||
|
];
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
runBurnTaskSet(all, 'burn-all-status');
|
||||||
|
}
|
||||||
|
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||||
|
const map = {
|
||||||
|
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||||
|
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||||
|
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||||
|
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||||
|
};
|
||||||
|
tools.forEach(function(t) {
|
||||||
|
const spec = map[t.id];
|
||||||
|
if (!spec) return;
|
||||||
|
const cb = document.getElementById(spec.cb);
|
||||||
|
const note = document.getElementById(spec.note);
|
||||||
|
if (!cb) return;
|
||||||
|
if (t.available) {
|
||||||
|
cb.disabled = false;
|
||||||
|
} else if (note) {
|
||||||
|
note.textContent = '— ' + spec.reason;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}).catch(function() {});
|
||||||
|
burnLoadGPUs();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
510
audit/internal/webui/page_export_tools.go
Normal file
510
audit/internal/webui/page_export_tools.go
Normal file
@@ -0,0 +1,510 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"net/url"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func renderExport(exportDir string) string {
|
||||||
|
entries, _ := listExportFiles(exportDir)
|
||||||
|
var rows strings.Builder
|
||||||
|
for _, e := range entries {
|
||||||
|
rows.WriteString(fmt.Sprintf(`<tr><td><a href="/export/file?path=%s" target="_blank">%s</a></td></tr>`,
|
||||||
|
url.QueryEscape(e), html.EscapeString(e)))
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
rows.WriteString(`<tr><td style="color:var(--muted)">No export files found.</td></tr>`)
|
||||||
|
}
|
||||||
|
return `<div class="grid2">
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
` + renderSupportBundleInline() + `
|
||||||
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
|
</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
` + renderUSBExportCard()
|
||||||
|
}
|
||||||
|
|
||||||
|
func listExportFiles(exportDir string) ([]string, error) {
|
||||||
|
var entries []string
|
||||||
|
err := filepath.Walk(strings.TrimSpace(exportDir), func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if info.IsDir() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rel, err := filepath.Rel(exportDir, path)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entries = append(entries, rel)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
sort.Strings(entries)
|
||||||
|
return entries, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderSupportBundleInline() string {
|
||||||
|
return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
|
||||||
|
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
|
||||||
|
<script>
|
||||||
|
window.supportBundleDownload = function() {
|
||||||
|
var btn = document.getElementById('support-bundle-btn');
|
||||||
|
var status = document.getElementById('support-bundle-status');
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Building...';
|
||||||
|
status.textContent = 'Collecting logs and export data\u2026';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
var filename = 'bee-support.tar.gz';
|
||||||
|
fetch('/export/support.tar.gz')
|
||||||
|
.then(function(r) {
|
||||||
|
if (!r.ok) throw new Error('HTTP ' + r.status);
|
||||||
|
var cd = r.headers.get('Content-Disposition') || '';
|
||||||
|
var m = cd.match(/filename="?([^";]+)"?/);
|
||||||
|
if (m) filename = m[1];
|
||||||
|
return r.blob();
|
||||||
|
})
|
||||||
|
.then(function(blob) {
|
||||||
|
var url = URL.createObjectURL(blob);
|
||||||
|
var a = document.createElement('a');
|
||||||
|
a.href = url;
|
||||||
|
a.download = filename;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
URL.revokeObjectURL(url);
|
||||||
|
status.textContent = 'Download started.';
|
||||||
|
status.style.color = 'var(--ok-fg)';
|
||||||
|
})
|
||||||
|
.catch(function(e) {
|
||||||
|
status.textContent = 'Error: ' + e.message;
|
||||||
|
status.style.color = 'var(--crit-fg)';
|
||||||
|
})
|
||||||
|
.finally(function() {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = '\u2195 Download Support Bundle';
|
||||||
|
});
|
||||||
|
};
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderUSBExportCard() string {
|
||||||
|
return `<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">USB Black-Box
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||||
|
</div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderUSBExportInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||||
|
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||||
|
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||||
|
<div id="usb-targets" style="margin-top:12px"></div>
|
||||||
|
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||||
|
<script>
|
||||||
|
(function(){
|
||||||
|
function blackboxRefresh() {
|
||||||
|
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||||
|
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||||
|
document.getElementById('usb-targets').innerHTML = '';
|
||||||
|
document.getElementById('usb-msg').textContent = '';
|
||||||
|
Promise.all([
|
||||||
|
fetch('/api/export/usb').then(r=>r.json()),
|
||||||
|
fetch('/api/blackbox/status').then(r=>r.json())
|
||||||
|
]).then(function(values) {
|
||||||
|
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||||
|
const state = values[1] || {};
|
||||||
|
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||||
|
window._usbTargets = targets;
|
||||||
|
window._blackboxTargets = active;
|
||||||
|
const st = document.getElementById('usb-status');
|
||||||
|
const ct = document.getElementById('usb-targets');
|
||||||
|
const summary = document.getElementById('blackbox-summary');
|
||||||
|
if (state.boot_folder) {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||||
|
} else {
|
||||||
|
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||||
|
}
|
||||||
|
if (!targets || targets.length === 0) {
|
||||||
|
st.textContent = 'No removable USB devices found.';
|
||||||
|
} else {
|
||||||
|
st.textContent = targets.length + ' device(s) found:';
|
||||||
|
}
|
||||||
|
const byDevice = {};
|
||||||
|
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||||
|
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||||
|
targets.map((t, idx) => {
|
||||||
|
const dev = t.device || '';
|
||||||
|
const label = t.label || '';
|
||||||
|
const model = t.model || '';
|
||||||
|
const state = byDevice[dev];
|
||||||
|
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||||
|
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||||
|
return '<tr>' +
|
||||||
|
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||||
|
'<td>'+t.fs_type+'</td>' +
|
||||||
|
'<td>'+t.size+'</td>' +
|
||||||
|
'<td>'+label+'</td>' +
|
||||||
|
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||||
|
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||||
|
'<td style="white-space:nowrap">' +
|
||||||
|
(state
|
||||||
|
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||||
|
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||||
|
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||||
|
'</td></tr>';
|
||||||
|
}).join('') + '</table>';
|
||||||
|
}).catch(e => {
|
||||||
|
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
window.blackboxEnable = function(targetIndex, btn) {
|
||||||
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
if (!target) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: USB target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Enabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/enable', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify(target)
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxDisable = function(targetIndex, btn) {
|
||||||
|
const target = (window._usbTargets || [])[targetIndex];
|
||||||
|
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||||
|
if (!target || !active) {
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const msg = document.getElementById('usb-msg');
|
||||||
|
const row = btn ? btn.closest('td') : null;
|
||||||
|
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||||
|
const originalText = btn ? btn.textContent : '';
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Disabling...';
|
||||||
|
}
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--muted)';
|
||||||
|
rowMsg.textContent = 'Working...';
|
||||||
|
}
|
||||||
|
msg.style.color = 'var(--muted)';
|
||||||
|
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||||
|
fetch('/api/blackbox/disable', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||||
|
}).then(async r => {
|
||||||
|
const d = await r.json();
|
||||||
|
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||||
|
return d;
|
||||||
|
}).then(d => {
|
||||||
|
msg.style.color = 'var(--ok,green)';
|
||||||
|
msg.textContent = d.message || 'Done.';
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--ok,green)';
|
||||||
|
rowMsg.textContent = d.message || 'Done.';
|
||||||
|
}
|
||||||
|
}).catch(e => {
|
||||||
|
msg.style.color = 'var(--err,red)';
|
||||||
|
msg.textContent = 'Error: '+e;
|
||||||
|
if (rowMsg) {
|
||||||
|
rowMsg.style.color = 'var(--err,red)';
|
||||||
|
rowMsg.textContent = 'Error: ' + e;
|
||||||
|
}
|
||||||
|
}).finally(() => {
|
||||||
|
if (btn) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = originalText;
|
||||||
|
}
|
||||||
|
setTimeout(blackboxRefresh, 300);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
window.blackboxRefresh = blackboxRefresh;
|
||||||
|
blackboxRefresh();
|
||||||
|
})();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderNvidiaSelfHealInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:12px">
|
||||||
|
<button id="nvidia-restart-btn" class="btn btn-secondary" onclick="nvidiaRestartDrivers()">Restart GPU Drivers</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="loadNvidiaSelfHeal()">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVIDIA GPU status...</div>
|
||||||
|
<div id="nvidia-self-heal-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="nvidia-self-heal-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="nvidia-self-heal-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="nvidia-self-heal-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="nvidia-self-heal-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
function nvidiaSelfHealShowResult(label, status, output) {
|
||||||
|
var out = document.getElementById('nvidia-self-heal-out');
|
||||||
|
var term = document.getElementById('nvidia-self-heal-terminal');
|
||||||
|
var statusEl = document.getElementById('nvidia-self-heal-out-status');
|
||||||
|
var labelEl = document.getElementById('nvidia-self-heal-out-label');
|
||||||
|
out.style.display = 'block';
|
||||||
|
labelEl.textContent = label;
|
||||||
|
term.textContent = output || '(no output)';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
if (status === 'ok') {
|
||||||
|
statusEl.textContent = '✓ done';
|
||||||
|
statusEl.style.color = 'var(--ok-fg, #2c662d)';
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = '✗ failed';
|
||||||
|
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function nvidiaRestartDrivers() {
|
||||||
|
var btn = document.getElementById('nvidia-restart-btn');
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Restarting...';
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'ok', 'Running...');
|
||||||
|
fetch('/api/services/action', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({name:'bee-nvidia', action:'restart'})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', d.status || 'error', d.output || d.error || '(no output)');
|
||||||
|
setTimeout(function() {
|
||||||
|
loadServices();
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
}, 800);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('restart bee-nvidia', 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function nvidiaResetGPU(index, btn) {
|
||||||
|
var original = btn.textContent;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Resetting...';
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'ok', 'Running...');
|
||||||
|
fetch('/api/gpu/nvidia-reset', {
|
||||||
|
method:'POST',
|
||||||
|
headers:{'Content-Type':'application/json'},
|
||||||
|
body:JSON.stringify({index:index})
|
||||||
|
}).then(r=>r.json()).then(d => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, d.status || 'error', d.output || '(no output)');
|
||||||
|
setTimeout(loadNvidiaSelfHeal, 1000);
|
||||||
|
}).catch(e => {
|
||||||
|
nvidiaSelfHealShowResult('reset gpu ' + index, 'error', 'Request failed: ' + e);
|
||||||
|
}).finally(() => {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = original;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadNvidiaSelfHeal() {
|
||||||
|
var status = document.getElementById('nvidia-self-heal-status');
|
||||||
|
var table = document.getElementById('nvidia-self-heal-table');
|
||||||
|
status.textContent = 'Loading NVIDIA GPU status...';
|
||||||
|
status.style.color = 'var(--muted)';
|
||||||
|
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||||
|
fetch('/api/gpu/nvidia-status').then(r=>r.json()).then(gpus => {
|
||||||
|
if (!Array.isArray(gpus) || gpus.length === 0) {
|
||||||
|
status.textContent = 'No NVIDIA GPUs detected or nvidia-smi is unavailable.';
|
||||||
|
table.innerHTML = '';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
status.textContent = gpus.length + ' NVIDIA GPU(s) detected.';
|
||||||
|
const rows = gpus.map(g => {
|
||||||
|
const serial = g.serial || '';
|
||||||
|
const bdf = g.bdf || '';
|
||||||
|
const id = serial || bdf || ('gpu-' + g.index);
|
||||||
|
const badge = g.status === 'OK' ? 'badge-ok' : g.status === 'RESET_REQUIRED' ? 'badge-err' : 'badge-warn';
|
||||||
|
const details = [];
|
||||||
|
if (serial) details.push('serial ' + serial);
|
||||||
|
if (bdf) details.push('bdf ' + bdf);
|
||||||
|
if (g.parse_failure && g.raw_line) details.push(g.raw_line);
|
||||||
|
return '<tr>'
|
||||||
|
+ '<td style="white-space:nowrap">' + g.index + '</td>'
|
||||||
|
+ '<td>' + (g.name || 'unknown') + '</td>'
|
||||||
|
+ '<td style="font-family:monospace">' + id + '</td>'
|
||||||
|
+ '<td><span class="badge ' + badge + '">' + (g.status || 'UNKNOWN') + '</span>'
|
||||||
|
+ (details.length ? '<div style="margin-top:4px;font-size:12px;color:var(--muted)">' + details.join(' | ') + '</div>' : '')
|
||||||
|
+ '</td>'
|
||||||
|
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-secondary" onclick="nvidiaResetGPU(' + g.index + ', this)">Reset GPU</button></td>'
|
||||||
|
+ '</tr>';
|
||||||
|
}).join('');
|
||||||
|
table.innerHTML = '<table><tr><th>GPU</th><th>Model</th><th>ID</th><th>Status</th><th>Action</th></tr>' + rows + '</table>';
|
||||||
|
}).catch(e => {
|
||||||
|
status.textContent = 'Error loading NVIDIA GPU status: ' + e;
|
||||||
|
status.style.color = 'var(--crit-fg, #9f3a38)';
|
||||||
|
table.innerHTML = '';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
loadNvidiaSelfHeal();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTools returns the HTML (markup plus inline JS) for the Tools page:
// a System Install card (install-to-RAM status polled once from
// /api/system/ram-status, plus the shared install-to-disk UI), a Support
// Bundle card with the USB black-box export, a Tool Check table fed by
// /api/tools/check, and inline NVIDIA self-heal, Network and Services
// sections spliced in from sibling render*Inline helpers.
func renderTools() string {
	return `<div class="card" style="margin-bottom:16px">
<div class="card-head">System Install</div>
<div class="card-body">
<div style="margin-bottom:20px">
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
</div>
<div style="border-top:1px solid var(--line);padding-top:20px">
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
		// Shared disk-install fragment (also used by the /install page).
		renderInstallInline() + `
</div>
</div>
</div>
<script>
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
const boot = document.getElementById('boot-source-text');
const txt = document.getElementById('ram-status-text');
const btn = document.getElementById('ram-install-btn');
let source = d.device || d.source || 'unknown source';
let kind = d.kind || 'unknown';
let label = source;
if (kind === 'ram') label = 'RAM';
else if (kind === 'usb') label = 'USB (' + source + ')';
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
else if (kind === 'disk') label = 'disk (' + source + ')';
else label = source;
boot.textContent = 'Current boot source: ' + label + '.';
txt.textContent = d.message || 'Checking...';
if (d.status === 'ok' || d.in_ram) {
txt.style.color = 'var(--ok, green)';
} else if (d.status === 'failed') {
txt.style.color = 'var(--err, #b91c1c)';
} else {
txt.style.color = 'var(--muted)';
}
if (d.can_start_task) {
btn.style.display = '';
btn.disabled = false;
} else {
btn.style.display = 'none';
}
});
function installToRAM() {
document.getElementById('ram-install-btn').disabled = true;
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
window.location.href = '/tasks#' + d.task_id;
});
}
</script>

<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
` + renderSupportBundleInline() + `
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
` + renderUSBExportInline() + `
</div>
</div></div>

<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>

<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
		// Self-heal fragment defined alongside the NVIDIA status page.
		renderNvidiaSelfHealInline() + `</div></div>

<div class="card"><div class="card-head">Network</div><div class="card-body">` +
		renderNetworkInline() + `</div></div>

<div class="card"><div class="card-head">Services</div><div class="card-body">` +
		renderServicesInline() + `</div></div>


<script>
function checkTools() {
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
const rows = tools.map(t =>
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
).join('');
document.getElementById('tools-table').innerHTML =
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
});
}
checkTools();
</script>`
}
|
||||||
|
|
||||||
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
entries, err := listExportFiles(exportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<!DOCTYPE html><html><head><meta charset="utf-8"><title>Bee Export Files</title></head><body>`)
|
||||||
|
body.WriteString(`<h1>Bee Export Files</h1><ul>`)
|
||||||
|
for _, entry := range entries {
|
||||||
|
body.WriteString(`<li><a href="/export/file?path=` + url.QueryEscape(entry) + `">` + html.EscapeString(entry) + `</a></li>`)
|
||||||
|
}
|
||||||
|
if len(entries) == 0 {
|
||||||
|
body.WriteString(`<li>No export files found.</li>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</ul></body></html>`)
|
||||||
|
return body.String(), nil
|
||||||
|
}
|
||||||
314
audit/internal/webui/page_install_tasks.go
Normal file
314
audit/internal/webui/page_install_tasks.go
Normal file
@@ -0,0 +1,314 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
// renderInstallInline returns the install-to-disk UI fragment without an
// outer card, for embedding in both the Tools page and the /install page.
// It renders a destructive-erase warning, a selectable disk table fed by
// /api/install/disks, a type-the-device-name confirmation gate, and an
// installation progress terminal streamed over SSE from
// /api/tasks/{id}/stream. Disk warnings are shown as badges whose full text
// lives in a title attribute; double quotes in that text are escaped as
// &quot; so the attribute cannot be broken out of (the previous code
// replaced a quote with itself, a no-op).
func renderInstallInline() string {
	return `
<div class="alert alert-warn" style="margin-bottom:16px">
<strong>Warning:</strong> Installing will <strong>completely erase</strong> the selected
disk and write the live system onto it. All existing data on the target disk will be lost.
This operation cannot be undone.
</div>
<div id="install-loading" style="color:var(--muted);font-size:13px">Loading disk list…</div>
<div id="install-disk-section" style="display:none">
<div class="card" style="margin-bottom:0">
<table id="install-disk-table">
<thead><tr><th></th><th>Device</th><th>Model</th><th>Size</th><th>Status</th></tr></thead>
<tbody id="install-disk-tbody"></tbody>
</table>
</div>
<div style="margin-top:12px">
<button class="btn btn-secondary btn-sm" onclick="installRefreshDisks()">↻ Refresh</button>
</div>
</div>
<div id="install-confirm-section" style="display:none;margin-top:20px">
<div id="install-confirm-warn" class="alert" style="background:#fff6f6;border:1px solid #e0b4b4;color:#9f3a38;font-size:13px"></div>
<div class="form-row" style="max-width:360px">
<label>Type the device name to confirm (e.g. /dev/sda)</label>
<input type="text" id="install-confirm-input" placeholder="/dev/..." oninput="installCheckConfirm()" autocomplete="off" spellcheck="false">
</div>
<button class="btn btn-danger" id="install-start-btn" disabled onclick="installStart()">Install to Disk</button>
<button class="btn btn-secondary" style="margin-left:8px" onclick="installDeselect()">Cancel</button>
</div>
<div id="install-progress-section" style="display:none;margin-top:20px">
<div class="card-head" style="margin-bottom:8px">Installation Progress</div>
<div id="install-terminal" class="terminal" style="max-height:500px"></div>
<div id="install-status" style="margin-top:12px;font-size:13px"></div>
</div>

<style>
#install-disk-tbody tr{cursor:pointer}
#install-disk-tbody tr.selected td{background:rgba(33,133,208,.1)}
#install-disk-tbody tr:hover td{background:rgba(33,133,208,.07)}
</style>

<script>
var _installSelected = null;

function installRefreshDisks() {
document.getElementById('install-loading').style.display = '';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-confirm-section').style.display = 'none';
_installSelected = null;
fetch('/api/install/disks').then(function(r){ return r.json(); }).then(function(disks){
document.getElementById('install-loading').style.display = 'none';
var tbody = document.getElementById('install-disk-tbody');
tbody.innerHTML = '';
if (!disks || disks.length === 0) {
tbody.innerHTML = '<tr><td colspan="5" style="color:var(--muted);text-align:center">No installable disks found</td></tr>';
} else {
disks.forEach(function(d) {
var warnings = (d.warnings || []);
var statusHtml;
if (warnings.length === 0) {
statusHtml = '<span class="badge badge-ok">OK</span>';
} else {
var hasSmall = warnings.some(function(w){ return w.indexOf('too small') >= 0; });
statusHtml = warnings.map(function(w){
var cls = hasSmall ? 'badge-err' : 'badge-warn';
return '<span class="badge ' + cls + '" title="' + w.replace(/"/g,'&quot;') + '">' +
(w.length > 40 ? w.substring(0,38)+'…' : w) + '</span>';
}).join(' ');
}
var mountedNote = (d.mounted_parts && d.mounted_parts.length > 0)
? ' <span style="color:var(--warn-fg);font-size:11px">(mounted)</span>' : '';
var tr = document.createElement('tr');
tr.dataset.device = d.device;
tr.dataset.model = d.model || 'Unknown';
tr.dataset.size = d.size;
tr.dataset.warnings = JSON.stringify(warnings);
tr.innerHTML =
'<td><input type="radio" name="install-disk" value="' + d.device + '"></td>' +
'<td><code>' + d.device + '</code>' + mountedNote + '</td>' +
'<td>' + (d.model || '—') + '</td>' +
'<td>' + d.size + '</td>' +
'<td>' + statusHtml + '</td>';
tr.addEventListener('click', function(){ installSelectDisk(this); });
tbody.appendChild(tr);
});
}
document.getElementById('install-disk-section').style.display = '';
}).catch(function(e){
document.getElementById('install-loading').textContent = 'Failed to load disk list: ' + e;
});
}

function installSelectDisk(tr) {
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
tr.classList.add('selected');
var radio = tr.querySelector('input[type=radio]');
if (radio) radio.checked = true;
_installSelected = {
device: tr.dataset.device,
model: tr.dataset.model,
size: tr.dataset.size,
warnings: JSON.parse(tr.dataset.warnings || '[]')
};
var warnBox = document.getElementById('install-confirm-warn');
var warnLines = '<strong>⚠ DANGER:</strong> ' + _installSelected.device +
' (' + _installSelected.model + ', ' + _installSelected.size + ')' +
' will be <strong>completely erased</strong> and repartitioned. All data will be lost.<br>';
if (_installSelected.warnings.length > 0) {
warnLines += '<br>' + _installSelected.warnings.map(function(w){ return '• ' + w; }).join('<br>');
}
warnBox.innerHTML = warnLines;
document.getElementById('install-confirm-input').value = '';
document.getElementById('install-start-btn').disabled = true;
document.getElementById('install-confirm-section').style.display = '';
document.getElementById('install-progress-section').style.display = 'none';
}

function installDeselect() {
_installSelected = null;
document.querySelectorAll('#install-disk-tbody tr').forEach(function(r){ r.classList.remove('selected'); });
document.querySelectorAll('#install-disk-tbody input[type=radio]').forEach(function(r){ r.checked = false; });
document.getElementById('install-confirm-section').style.display = 'none';
}

function installCheckConfirm() {
var val = document.getElementById('install-confirm-input').value.trim();
var ok = _installSelected && val === _installSelected.device;
document.getElementById('install-start-btn').disabled = !ok;
}

function installStart() {
if (!_installSelected) return;
document.getElementById('install-confirm-section').style.display = 'none';
document.getElementById('install-disk-section').style.display = 'none';
document.getElementById('install-loading').style.display = 'none';
var prog = document.getElementById('install-progress-section');
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
prog.style.display = '';
term.textContent = '';
status.textContent = 'Starting installation…';
status.style.color = 'var(--muted)';

fetch('/api/install/run', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({device: _installSelected.device})
}).then(function(r){
return r.json().then(function(j){
if (!r.ok) throw new Error(j.error || r.statusText);
return j;
});
}).then(function(j){
if (!j.task_id) throw new Error('missing task id');
installStreamLog(j.task_id);
}).catch(function(e){
status.textContent = 'Error: ' + e;
status.style.color = 'var(--crit-fg)';
});
}

function installStreamLog(taskId) {
var term = document.getElementById('install-terminal');
var status = document.getElementById('install-status');
var es = new EventSource('/api/tasks/' + taskId + '/stream');
es.onmessage = function(e) {
term.textContent += e.data + '\n';
term.scrollTop = term.scrollHeight;
};
es.addEventListener('done', function(e) {
es.close();
if (!e.data) {
status.innerHTML = '<span style="color:var(--ok-fg);font-weight:700">✓ Installation complete.</span> Remove the ISO and reboot.';
var rebootBtn = document.createElement('button');
rebootBtn.className = 'btn btn-primary btn-sm';
rebootBtn.style.marginLeft = '12px';
rebootBtn.textContent = 'Reboot now';
rebootBtn.onclick = function(){
fetch('/api/services/action', {method:'POST',headers:{'Content-Type':'application/json'},
body: JSON.stringify({name:'', action:'reboot'})});
};
status.appendChild(rebootBtn);
} else {
status.textContent = '✗ Installation failed: ' + e.data;
status.style.color = 'var(--crit-fg)';
}
});
es.onerror = function() {
es.close();
status.textContent = '✗ Stream disconnected.';
status.style.color = 'var(--crit-fg)';
};
}

installRefreshDisks();
</script>
`
}
|
||||||
|
|
||||||
|
func renderInstall() string {
|
||||||
|
return `<div class="card"><div class="card-head">Install Live System to Disk</div><div class="card-body">` +
|
||||||
|
renderInstallInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTasks returns the HTML (markup plus inline JS) for the task list
// page: a table refreshed every 2 seconds from /api/tasks, paginated
// client-side at 50 rows per page, with per-task cancel and priority
// controls, a cancel-all button, and an "abort + kill orphans" action
// posting to /api/tasks/kill-workers. Task names are passed through escHtml
// before being spliced into row markup; escHtml now emits real HTML
// entities (&amp;, &lt;, &gt;, &quot;) — previously each character was
// replaced with itself, leaving names completely unescaped.
func renderTasks() string {
	return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
</div>
<div class="card">
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
</div>
<script>
var _taskRefreshTimer = null;
var _tasksAll = [];
var _taskPage = 1;
var _taskPageSize = 50;

function loadTasks() {
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
_tasksAll = Array.isArray(tasks) ? tasks : [];
if (_tasksAll.length === 0) {
_taskPage = 1;
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
return;
}
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
if (_taskPage > totalPages) _taskPage = totalPages;
if (_taskPage < 1) _taskPage = 1;
const start = (_taskPage - 1) * _taskPageSize;
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
const rows = pageTasks.map(t => {
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
let actions = '<a class="btn btn-sm btn-secondary" href="/tasks/'+encodeURIComponent(t.id)+'">Open</a>';
if (t.status === 'running' || t.status === 'pending') {
actions += ' <button class="btn btn-sm btn-danger" onclick="cancelTask(\''+t.id+'\')">Cancel</button>';
}
if (t.status === 'pending') {
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',1)" title="Increase priority">⇧</button>';
actions += ' <button class="btn btn-sm btn-secondary" onclick="setPriority(\''+t.id+'\',-1)" title="Decrease priority">⇩</button>';
}
return '<tr><td><a href="/tasks/'+encodeURIComponent(t.id)+'">'+escHtml(t.name)+'</a></td>' +
'<td><span class="badge '+statusClass+'">'+statusLabel+'</span></td>' +
'<td style="font-size:12px;color:var(--muted)">'+fmtTime(t.created_at)+'</td>' +
'<td style="font-size:12px;color:var(--muted)">'+dur+'</td>' +
'<td>'+t.priority+'</td>' +
'<td>'+actions+'</td></tr>';
}).join('');
const showingFrom = start + 1;
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
const pager =
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
'<div style="display:flex;align-items:center;gap:8px">' +
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
'</div>' +
'</div>';
document.getElementById('tasks-table').innerHTML =
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
});
}

function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDurSec(sec) {
sec = Math.max(0, Math.round(sec||0));
if (sec < 60) return sec+'s';
const m = Math.floor(sec/60), ss = sec%60;
return m+'m '+ss+'s';
}
function setTaskPage(page) {
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
_taskPage = Math.min(totalPages, Math.max(1, page));
loadTasks();
}

function cancelTask(id) {
fetch('/api/tasks/'+id+'/cancel',{method:'POST'}).then(()=>loadTasks());
}
function cancelAll() {
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
}
function killWorkers() {
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
fetch('/api/tasks/kill-workers',{method:'POST'})
.then(r=>r.json())
.then(d=>{
loadTasks();
var toast = document.getElementById('kill-toast');
var parts = [];
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
toast.style.display = '';
setTimeout(()=>{ toast.style.display='none'; }, 5000);
});
}
function setPriority(id, delta) {
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
.then(()=>loadTasks());
}

loadTasks();
_taskRefreshTimer = setInterval(loadTasks, 2000);
</script>`
}
|
||||||
238
audit/internal/webui/page_metrics.go
Normal file
238
audit/internal/webui/page_metrics.go
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
func renderMetrics() string {
|
||||||
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-load" data-chart-refresh="1" src="/api/metrics/chart/server-load.svg" style="width:100%;display:block;border-radius:6px" alt="CPU/Mem load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — CPU</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-cpu" data-chart-refresh="1" src="/api/metrics/chart/server-temp-cpu.svg" style="width:100%;display:block;border-radius:6px" alt="CPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Temperature — Ambient Sensors</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-temp-ambient" data-chart-refresh="1" src="/api/metrics/chart/server-temp-ambient.svg" style="width:100%;display:block;border-radius:6px" alt="Ambient temperature sensors">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Server — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-power" data-chart-refresh="1" src="/api/metrics/chart/server-power.svg" style="width:100%;display:block;border-radius:6px" alt="System power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="card-server-fans" class="card" style="margin-bottom:16px;display:none">
|
||||||
|
<div class="card-head">Server — Fan RPM</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-server-fans" data-chart-refresh="1" src="/api/metrics/chart/server-fans.svg" style="width:100%;display:block;border-radius:6px" alt="Fan RPM">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section id="gpu-metrics-section" style="display:none;margin-top:24px;padding:16px 16px 4px;border:1px solid #d7e0ea;border-radius:10px;background:linear-gradient(180deg,#f7fafc 0%,#eef4f8 100%)">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;gap:16px;flex-wrap:wrap;margin-bottom:14px">
|
||||||
|
<div>
|
||||||
|
<div style="font-size:12px;font-weight:700;letter-spacing:.08em;text-transform:uppercase;color:#486581">GPU Metrics</div>
|
||||||
|
<div id="gpu-metrics-summary" style="font-size:13px;color:var(--muted);margin-top:4px">Detected GPUs are rendered in a dedicated section.</div>
|
||||||
|
</div>
|
||||||
|
<label style="display:inline-flex;align-items:center;gap:8px;font-size:13px;color:var(--ink);font-weight:700;cursor:pointer">
|
||||||
|
<input id="gpu-chart-toggle" type="checkbox">
|
||||||
|
<span>One chart per GPU</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-metric">
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Compute Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-load" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-load.svg" style="width:100%;display:block;border-radius:6px" alt="GPU compute load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Memory Load</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-memload" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-memload.svg" style="width:100%;display:block;border-radius:6px" alt="GPU memory load">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Core Clock</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-clock" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-clock.svg" style="width:100%;display:block;border-radius:6px" alt="GPU core clock">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Power</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-power" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-power.svg" style="width:100%;display:block;border-radius:6px" alt="GPU power">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">GPU — Temperature</div>
|
||||||
|
<div class="card-body" style="padding:8px">
|
||||||
|
<img id="chart-gpu-all-temp" data-chart-refresh="1" src="/api/metrics/chart/gpu-all-temp.svg" style="width:100%;display:block;border-radius:6px" alt="GPU temperature">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="gpu-metrics-by-gpu" style="display:none"></div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
let gpuChartKey = '';
|
||||||
|
const gpuChartModeStorageKey = 'bee.metrics.gpuChartMode';
|
||||||
|
let metricsNvidiaGPUsPromise = null;
|
||||||
|
|
||||||
|
function loadMetricsNvidiaGPUs() {
|
||||||
|
if (!metricsNvidiaGPUsPromise) {
|
||||||
|
metricsNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
|
||||||
|
.then(function(r) {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
})
|
||||||
|
.then(function(list) { return Array.isArray(list) ? list : []; })
|
||||||
|
.catch(function() { return []; });
|
||||||
|
}
|
||||||
|
return metricsNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUNameMap(list) {
|
||||||
|
const out = {};
|
||||||
|
(list || []).forEach(function(gpu) {
|
||||||
|
const idx = Number(gpu.index);
|
||||||
|
if (!Number.isFinite(idx) || !gpu.name) return;
|
||||||
|
out[idx] = gpu.name;
|
||||||
|
});
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
function metricsGPUDisplayLabel(idx, names) {
|
||||||
|
const name = names && names[idx];
|
||||||
|
return name ? ('GPU ' + idx + ' — ' + name) : ('GPU ' + idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadGPUChartModePreference() {
|
||||||
|
try {
|
||||||
|
return sessionStorage.getItem(gpuChartModeStorageKey) === 'per-gpu';
|
||||||
|
} catch (_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveGPUChartModePreference(perGPU) {
|
||||||
|
try {
|
||||||
|
sessionStorage.setItem(gpuChartModeStorageKey, perGPU ? 'per-gpu' : 'per-metric');
|
||||||
|
} catch (_) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshChartImage(el) {
|
||||||
|
if (!el || el.dataset.loading === '1') return;
|
||||||
|
if (el.offsetParent === null) return;
|
||||||
|
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||||
|
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||||
|
const probe = new Image();
|
||||||
|
el.dataset.baseSrc = baseSrc;
|
||||||
|
el.dataset.loading = '1';
|
||||||
|
probe.onload = function() {
|
||||||
|
el.src = nextSrc;
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.onerror = function() {
|
||||||
|
el.dataset.loading = '0';
|
||||||
|
};
|
||||||
|
probe.src = nextSrc;
|
||||||
|
}
|
||||||
|
|
||||||
|
function refreshCharts() {
|
||||||
|
document.querySelectorAll('img[data-chart-refresh="1"]').forEach(refreshChartImage);
|
||||||
|
}
|
||||||
|
|
||||||
|
function gpuIndices(rows) {
|
||||||
|
const seen = {};
|
||||||
|
const out = [];
|
||||||
|
(rows || []).forEach(function(row) {
|
||||||
|
const idx = Number(row.index);
|
||||||
|
if (!Number.isFinite(idx) || seen[idx]) return;
|
||||||
|
seen[idx] = true;
|
||||||
|
out.push(idx);
|
||||||
|
});
|
||||||
|
return out.sort(function(a, b) { return a - b; });
|
||||||
|
}
|
||||||
|
|
||||||
|
function renderGPUOverviewCards(indices, names) {
|
||||||
|
const host = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
if (!host) return;
|
||||||
|
host.innerHTML = indices.map(function(idx) {
|
||||||
|
const label = metricsGPUDisplayLabel(idx, names);
|
||||||
|
return '<div class="card" style="margin-bottom:16px">' +
|
||||||
|
'<div class="card-head">' + label + ' — Overview</div>' +
|
||||||
|
'<div class="card-body" style="padding:8px">' +
|
||||||
|
'<img id="chart-gpu-' + idx + '-overview" data-chart-refresh="1" src="/api/metrics/chart/gpu/' + idx + '-overview.svg" style="width:100%;display:block;border-radius:6px" alt="' + label + ' overview">' +
|
||||||
|
'</div></div>';
|
||||||
|
}).join('');
|
||||||
|
}
|
||||||
|
|
||||||
|
function applyGPUChartMode() {
|
||||||
|
const perMetric = document.getElementById('gpu-metrics-by-metric');
|
||||||
|
const perGPU = document.getElementById('gpu-metrics-by-gpu');
|
||||||
|
const toggle = document.getElementById('gpu-chart-toggle');
|
||||||
|
const gpuModePerGPU = !!(toggle && toggle.checked);
|
||||||
|
if (perMetric) perMetric.style.display = gpuModePerGPU ? 'none' : '';
|
||||||
|
if (perGPU) perGPU.style.display = gpuModePerGPU ? '' : 'none';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reconcile the metrics-page layout with the latest sample `d`:
// hide the fan card when no fans are reported, toggle the GPU section,
// refresh the GPU summary line, and rebuild the per-GPU overview cards
// only when the set of GPU indices or their names actually changed.
function syncMetricsLayout(d) {
  const fansCard = document.getElementById('card-server-fans');
  if (fansCard) {
    fansCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
  }
  const gpuSection = document.getElementById('gpu-metrics-section');
  const gpuSummary = document.getElementById('gpu-metrics-summary');
  const idxList = gpuIndices(d.gpus);
  loadMetricsNvidiaGPUs().then(function(gpus) {
    const nameMap = metricsGPUNameMap(gpus);
    if (gpuSection) gpuSection.style.display = idxList.length > 0 ? '' : 'none';
    if (gpuSummary) {
      if (idxList.length > 0) {
        const labels = idxList.map(function(i) { return metricsGPUDisplayLabel(i, nameMap); });
        gpuSummary.textContent = 'Detected GPUs: ' + labels.join(', ');
      } else {
        gpuSummary.textContent = 'No GPUs detected in live metrics.';
      }
    }
    // Cheap change-detection key: indices plus their reported names.
    const key = idxList.join(',') + '|' +
      idxList.map(function(i) { return nameMap[i] || ''; }).join(',');
    if (key !== gpuChartKey) {
      renderGPUOverviewCards(idxList, nameMap);
      gpuChartKey = key;
    }
    applyGPUChartMode();
  });
}
|
||||||
|
|
||||||
|
// Fetch the latest metrics sample and re-sync the page layout.
// Errors are swallowed; the next poll cycle retries.
function loadMetricsLayout() {
  fetch('/api/metrics/latest')
    .then(function(r) { return r.json(); })
    .then(syncMetricsLayout)
    .catch(function() {});
}
|
||||||
|
|
||||||
|
// Wire up the per-GPU/per-metric chart toggle: restore the persisted
// preference, apply it once, then persist and re-apply on each change.
const gpuChartToggle = document.getElementById('gpu-chart-toggle');
if (gpuChartToggle) {
  gpuChartToggle.checked = loadGPUChartModePreference();
}
applyGPUChartMode();

if (gpuChartToggle) {
  gpuChartToggle.addEventListener('change', function() {
    saveGPUChartModePreference(!!gpuChartToggle.checked);
    applyGPUChartMode();
    refreshCharts();
  });
}

// Initial load plus periodic refresh of the charts and the layout.
loadMetricsLayout();
setInterval(refreshCharts, 3000);
setInterval(loadMetricsLayout, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
213
audit/internal/webui/page_network_services.go
Normal file
213
audit/internal/webui/page_network_services.go
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import "html"
|
||||||
|
|
||||||
|
// renderNetworkInline returns the network UI without a wrapping card (for embedding in Tools).
|
||||||
|
func renderNetworkInline() string {
|
||||||
|
return `<div id="net-pending" style="display:none" class="alert alert-warn">
|
||||||
|
<strong>⚠ Network change applied.</strong> Reverting in <span id="net-countdown">60</span>s unless confirmed.
|
||||||
|
<button class="btn btn-primary btn-sm" style="margin-left:8px" onclick="confirmNetChange()">Confirm</button>
|
||||||
|
<button class="btn btn-secondary btn-sm" style="margin-left:4px" onclick="rollbackNetChange()">Rollback</button>
|
||||||
|
</div>
|
||||||
|
<div id="iface-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div class="grid2" style="margin-top:16px">
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">DHCP</div>
|
||||||
|
<div class="form-row"><label>Interface (leave empty for all)</label><input type="text" id="dhcp-iface" placeholder="eth0"></div>
|
||||||
|
<button class="btn btn-primary" onclick="runDHCP()">▶ Run DHCP</button>
|
||||||
|
<div id="dhcp-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
<div><div style="font-weight:700;font-size:13px;margin-bottom:8px">Static IPv4</div>
|
||||||
|
<div class="form-row"><label>Interface</label><input type="text" id="st-iface" placeholder="eth0"></div>
|
||||||
|
<div class="form-row"><label>Address</label><input type="text" id="st-addr" placeholder="192.168.1.100"></div>
|
||||||
|
<div class="form-row"><label>Prefix length</label><input type="text" id="st-prefix" placeholder="24"></div>
|
||||||
|
<div class="form-row"><label>Gateway</label><input type="text" id="st-gw" placeholder="192.168.1.1"></div>
|
||||||
|
<div class="form-row"><label>DNS (comma-separated)</label><input type="text" id="st-dns" placeholder="8.8.8.8,8.8.4.4"></div>
|
||||||
|
<button class="btn btn-primary" onclick="setStatic()">Apply Static IP</button>
|
||||||
|
<div id="static-out" style="margin-top:10px;font-size:12px;color:var(--ok-fg)"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
var _netCountdownTimer = null;
|
||||||
|
var _netRefreshTimer = null;
|
||||||
|
const NET_ROLLBACK_SECS = 60;
|
||||||
|
// Fetch /api/network and render the interface table; also shows or
// hides the pending-change banner depending on the server-side state.
function loadNetwork() {
  fetch('/api/network').then(function(r) { return r.json(); }).then(function(d) {
    let rows = '';
    for (const i of (d.interfaces || [])) {
      rows += '<tr><td style="cursor:pointer" onclick="selectIface(\''+i.Name+'\')" title="Use this interface in the forms below"><span style="text-decoration:underline">'+i.Name+'</span></td>' +
        '<td style="cursor:pointer" onclick="toggleIface(\''+i.Name+'\',\''+i.State+'\')" title="Click to toggle"><span class="badge '+(i.State==='up'?'badge-ok':'badge-warn')+'">'+i.State+'</span></td>' +
        '<td>'+(i.IPv4||[]).join(', ')+'</td></tr>';
    }
    const routeNote = d.default_route
      ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>'
      : '';
    document.getElementById('iface-table').innerHTML =
      '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' + routeNote;
    if (d.pending_change) {
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    } else {
      hideNetPending();
    }
  }).catch(function() {});
}
|
||||||
|
// Copy the clicked interface name into both the DHCP and static forms.
function selectIface(iface) {
  ['dhcp-iface', 'st-iface'].forEach(function(id) {
    document.getElementById(id).value = iface;
  });
}
|
||||||
|
// Toggle the given interface up/down via the API. `currentState` is
// part of the onclick signature but the server decides the transition.
function toggleIface(iface, currentState) {
  showNetPending(NET_ROLLBACK_SECS);
  const opts = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({iface: iface})
  };
  fetch('/api/network/toggle', opts)
    .then(function(r) { return r.json(); })
    .then(function(d) {
      if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
      loadNetwork();
      showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    })
    .catch(function() {
      setTimeout(loadNetwork, 1500);
    });
}
|
||||||
|
// Hide the pending-change banner and stop its countdown timer.
// Fix: guard the element lookup — the original dereferenced
// el.style unconditionally and would throw if #net-pending were
// ever absent, whereas sibling helpers in this file null-check
// their DOM lookups.
function hideNetPending() {
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  const el = document.getElementById('net-pending');
  if (el) el.style.display = 'none';
}
|
||||||
|
// Show the pending-change banner with a live countdown of `secs`
// seconds. When the countdown hits zero the banner is hidden and the
// interface table reloaded (the server auto-rolls-back by then).
function showNetPending(secs) {
  if (!secs || secs < 1) { hideNetPending(); return; }
  const banner = document.getElementById('net-pending');
  banner.style.display = 'block';
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  let left = secs;
  document.getElementById('net-countdown').textContent = left;
  _netCountdownTimer = setInterval(function() {
    left -= 1;
    document.getElementById('net-countdown').textContent = left;
    if (left <= 0) {
      hideNetPending();
      loadNetwork();
    }
  }, 1000);
}
|
||||||
|
// Confirm the pending network change so the server cancels rollback.
function confirmNetChange() {
  hideNetPending();
  fetch('/api/network/confirm', {method: 'POST'})
    .then(function() { loadNetwork(); })
    .catch(function() {});
}
|
||||||
|
// Ask the server to roll back the pending network change immediately.
function rollbackNetChange() {
  hideNetPending();
  fetch('/api/network/rollback', {method: 'POST'})
    .then(function() { loadNetwork(); })
    .catch(function() {});
}
|
||||||
|
// Run DHCP on the named interface (or every interface when the field
// is empty) and display the command output under the form.
function runDHCP() {
  const iface = document.getElementById('dhcp-iface').value.trim();
  showNetPending(NET_ROLLBACK_SECS);
  fetch('/api/network/dhcp', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({interface: iface || 'all'})
  }).then(function(r) { return r.json(); }).then(function(d) {
    document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}
|
||||||
|
// Apply a static IPv4 configuration from the form fields. DNS servers
// come from a comma-separated list with blank entries dropped.
function setStatic() {
  const dnsList = document.getElementById('st-dns').value
    .split(',')
    .map(function(s) { return s.trim(); })
    .filter(Boolean);
  showNetPending(NET_ROLLBACK_SECS);
  const payload = {
    interface: document.getElementById('st-iface').value,
    address: document.getElementById('st-addr').value,
    prefix: document.getElementById('st-prefix').value,
    gateway: document.getElementById('st-gw').value,
    dns: dnsList,
  };
  fetch('/api/network/static', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  }).then(function(r) { return r.json(); }).then(function(d) {
    document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
    if (d.error) { hideNetPending(); return; }
    showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
    loadNetwork();
  }).catch(function() {
    setTimeout(loadNetwork, 1500);
  });
}
|
||||||
|
// Initial render plus a 5 s auto-refresh; clear any previous timer so
// re-injecting this script cannot stack intervals.
loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderNetwork() string {
|
||||||
|
return `<div class="card"><div class="card-head">Network Interfaces</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServicesInline() string {
|
||||||
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:10px">` + html.EscapeString(`bee-selfheal.timer is expected to be active; the oneshot bee-selfheal.service itself is not shown as a long-running service.`) + `</p>
|
||||||
|
<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||||
|
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||||
|
<div id="svc-out" style="display:none;margin-top:12px">
|
||||||
|
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||||
|
<span id="svc-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||||
|
<span id="svc-out-status" style="font-size:12px"></span>
|
||||||
|
</div>
|
||||||
|
<div id="svc-terminal" class="terminal" style="max-height:220px;width:100%;box-sizing:border-box"></div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
// Fetch the service list and render the unit table with per-unit
// start/stop/restart buttons and a collapsible unit-file body.
function loadServices() {
  fetch('/api/services').then(r=>r.json()).then(svcs => {
    const rows = svcs.map(s => {
      const st = s.state||'unknown';
      const badge = st==='active' ? 'badge-ok' : st==='failed' ? 'badge-err' : 'badge-warn';
      // NOTE(review): s.name is interpolated into ids and onclick handlers
      // without escaping — confirm unit names stay within [a-z0-9.@-].
      const id = 'svc-body-'+s.name.replace(/[^a-z0-9]/g,'-');
      // BUG FIX: the previous code replaced '<' with '<' and '>' with '>'
      // (no-ops), so unit-file content was injected as raw HTML into the
      // <pre> below. Escape '&' first, then the angle brackets.
      const body = (s.body||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
      return '<tr>' +
        '<td style="white-space:nowrap">'+s.name+'</td>' +
        '<td style="white-space:nowrap"><span class="badge '+badge+'" style="cursor:pointer" onclick="toggleBody(\''+id+'\')">'+st+' ▾</span>' +
        '<div id="'+id+'" style="display:none;margin-top:6px"><pre style="font-size:11px;white-space:pre-wrap;word-break:break-all;max-height:200px;overflow-y:auto;background:#1b1c1d;padding:8px;border-radius:4px;color:#b5cea8">'+body+'</pre></div>' +
        '</td>' +
        '<td style="white-space:nowrap">' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-start" onclick="svcAction(this,\''+s.name+'\',\'start\')">Start</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-stop" onclick="svcAction(this,\''+s.name+'\',\'stop\')">Stop</button> ' +
        '<button class="btn btn-sm btn-secondary" id="btn-'+s.name+'-restart" onclick="svcAction(this,\''+s.name+'\',\'restart\')">Restart</button>' +
        '</td></tr>';
    }).join('');
    document.getElementById('svc-table').innerHTML =
      '<table><tr><th>Unit</th><th>Status</th><th>Actions</th></tr>'+rows+'</table>';
  });
}
|
||||||
|
// Toggle visibility of the unit-file body element identified by `id`.
function toggleBody(id) {
  const el = document.getElementById(id);
  if (!el) return;
  el.style.display = (el.style.display === 'none') ? 'block' : 'none';
}
|
||||||
|
// Run a systemd-style action ('start'/'stop'/'restart') against unit
// `name`, streaming the result into the shared output terminal.
// `btn` is the clicked button; it is disabled for the duration and its
// label restored afterwards.
function svcAction(btn, name, action) {
// Remember the label so it can be restored after '...'.
var label = btn.textContent;
btn.disabled = true;
btn.textContent = '...';
var out = document.getElementById('svc-out');
var term = document.getElementById('svc-terminal');
var statusEl = document.getElementById('svc-out-status');
var labelEl = document.getElementById('svc-out-label');
// Reveal the output panel and reset it for this invocation.
out.style.display = 'block';
labelEl.textContent = action + ' ' + name;
term.textContent = 'Running...';
statusEl.textContent = '';
statusEl.style.color = '';
fetch('/api/services/action',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({name,action})})
.then(r=>r.json()).then(d => {
term.textContent = d.output || d.error || '(no output)';
// Keep the newest output visible.
term.scrollTop = term.scrollHeight;
if (d.status === 'ok') {
statusEl.textContent = '✓ done';
statusEl.style.color = 'var(--ok-fg, #2c662d)';
} else {
statusEl.textContent = '✗ failed';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
}
btn.textContent = label;
btn.disabled = false;
// Short delay lets the unit state settle before re-querying.
setTimeout(loadServices, 800);
}).catch(e => {
// Network-level failure: show the error and re-enable the button.
term.textContent = 'Request failed: ' + e;
statusEl.textContent = '✗ error';
statusEl.style.color = 'var(--crit-fg, #9f3a38)';
btn.textContent = label;
btn.disabled = false;
});
}
|
||||||
|
// Populate the service table on first render.
loadServices();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderServices() string {
|
||||||
|
return `<div class="card"><div class="card-head">Bee Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() +
|
||||||
|
`</div></div>`
|
||||||
|
}
|
||||||
663
audit/internal/webui/page_validate.go
Normal file
663
audit/internal/webui/page_validate.go
Normal file
@@ -0,0 +1,663 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// validateInventory carries the pre-rendered hardware summary strings
// shown on the Validate page, plus GPU counts used to scale the
// duration estimates. Populated by loadValidateInventory (defined
// elsewhere in this package).
type validateInventory struct {
	CPU            string // CPU summary line
	Memory         string // memory summary line
	Storage        string // storage summary line
	NVIDIA         string // NVIDIA GPU summary line
	AMD            string // AMD GPU summary line
	NvidiaGPUCount int    // number of NVIDIA GPUs detected
	AMDGPUCount    int    // number of AMD GPUs detected
}
|
||||||
|
|
||||||
|
// validateFmtDur formats an estimated duration as a short human string:
// "~N s" for anything under two minutes, otherwise "~N min" with the
// minute count computed as (secs+29)/60 (i.e. remainders of 31 s or
// more round up).
func validateFmtDur(secs int) string {
	if secs >= 120 {
		return fmt.Sprintf("~%d min", (secs+29)/60)
	}
	return fmt.Sprintf("~%d s", secs)
}
|
||||||
|
|
||||||
|
func validateTotalValidateSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUValidateSec +
|
||||||
|
platform.SATEstimatedMemoryValidateSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUValidateSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTotalStressSec(n int) int {
|
||||||
|
if n < 0 {
|
||||||
|
n = 0
|
||||||
|
}
|
||||||
|
total := platform.SATEstimatedCPUStressSec +
|
||||||
|
platform.SATEstimatedMemoryStressSec +
|
||||||
|
platform.SATEstimatedNvidiaPulseTestSec +
|
||||||
|
platform.SATEstimatedNvidiaInterconnectSec +
|
||||||
|
platform.SATEstimatedNvidiaBandwidthSec
|
||||||
|
if n > 0 {
|
||||||
|
total += platform.SATEstimatedNvidiaGPUStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedStressSec +
|
||||||
|
platform.SATEstimatedNvidiaTargetedPowerSec
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidate(opts HandlerOptions) string {
|
||||||
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
stressTotalStr := validateFmtDur(validateTotalStressSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">Validate Profile</div>
|
||||||
|
<div class="card-body validate-profile-body">
|
||||||
|
<div class="validate-profile-col">
|
||||||
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||||
|
</div>
|
||||||
|
<div class="validate-profile-col validate-profile-action">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
|
<div style="margin-top:12px">
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
|
inv.CPU,
|
||||||
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` in Validate (stress-ng 60 s). `+validateFmtDur(platform.SATEstimatedCPUStressSec)+` in Stress (stress-ng 30 min).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
|
inv.Memory,
|
||||||
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` in Validate (256 MB × 1 pass). `+validateFmtDur(platform.SATEstimatedMemoryStressSec)+` in Stress (512 MB × 1 pass).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
|
inv.Storage,
|
||||||
|
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||||
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
|
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
fmt.Sprintf("Validate: %s (Level 2, all GPUs simultaneously). Stress: %s (Level 3, all GPUs simultaneously).",
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
|
)) +
|
||||||
|
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||||
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-targeted-power">` +
|
||||||
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
|
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-pulse">` +
|
||||||
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-interconnect">` +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`<div id="sat-card-nvidia-bandwidth">` +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
|
)) +
|
||||||
|
`</div>` +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
|
inv.AMD,
|
||||||
|
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||||
|
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||||
|
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
<style>
|
||||||
|
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||||
|
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||||
|
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||||
|
.validate-card-body { padding:0; }
|
||||||
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
let satES = null;
|
||||||
|
// True when the "Stress" mode radio button is currently selected.
function satStressMode() {
  const checked = document.querySelector('input[name="sat-mode"]:checked');
  return !!checked && checked.value === 'stress';
}
|
||||||
|
// Dim the stress-only NVIDIA cards and show their "stress only" hints
// while Validate mode is selected; restore them in Stress mode.
function satModeChanged() {
  const stress = satStressMode();
  const stressOnly = [
    {card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
    {card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
    {card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
  ];
  for (const item of stressOnly) {
    const card = document.getElementById(item.card);
    if (!card) continue;
    card.style.opacity = stress ? '1' : '0.5';
    const hint = document.getElementById(item.hint);
    if (hint) hint.style.display = stress ? 'none' : '';
  }
}
|
||||||
|
// Maps SAT target ids to the human-readable task titles used when
// enqueuing runs and labeling terminal output.
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
|
||||||
|
// Memoized promise for the NVIDIA GPU inventory (fetched at most once).
let satNvidiaGPUsPromise = null;
// Fetch /api/gpu/nvidia once and cache the resulting array. A non-OK
// response rejects; a non-array payload normalizes to []. Note: a
// rejected promise stays cached, so a failed fetch is not retried.
function loadSatNvidiaGPUs() {
  if (satNvidiaGPUsPromise) return satNvidiaGPUsPromise;
  satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
    .then(function(r) {
      if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
      return r.json();
    })
    .then(function(list) { return Array.isArray(list) ? list : []; });
  return satNvidiaGPUsPromise;
}
|
||||||
|
// Indices of the checked, enabled GPU checkboxes, sorted ascending.
function satSelectedGPUIndices() {
  const picked = [];
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    if (!el.checked || el.disabled) continue;
    const v = parseInt(el.value, 10);
    if (!Number.isNaN(v)) picked.push(v);
  }
  return picked.sort(function(a, b) { return a - b; });
}
|
||||||
|
// Refresh the helper text under the GPU list to reflect the current
// checkbox selection.
function satUpdateGPUSelectionNote() {
  const note = document.getElementById('sat-gpu-selection-note');
  if (!note) return;
  const selected = satSelectedGPUIndices();
  note.textContent = selected.length
    ? 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
    : 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
}
|
||||||
|
// Render one checkbox row per detected NVIDIA GPU (all checked by
// default), or a placeholder message when none are present.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  const rows = [];
  for (const gpu of gpus) {
    const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
    // NOTE(review): gpu.name is injected without HTML escaping — confirm
    // the API only returns trusted device names.
    rows.push('<label class="sat-gpu-row">'
      + '<input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()">'
      + '<span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span>'
      + '</label>');
  }
  root.innerHTML = rows.join('');
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Check every GPU checkbox, then refresh the selection note.
function satSelectAllGPUs() {
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    el.checked = true;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Uncheck every GPU checkbox, then refresh the selection note.
function satSelectNoGPUs() {
  for (const el of document.querySelectorAll('.sat-nvidia-checkbox')) {
    el.checked = false;
  }
  satUpdateGPUSelectionNote();
}
|
||||||
|
// Populate the GPU list card from the cached inventory, rendering an
// inline error message if the fetch failed.
function satLoadGPUs() {
  loadSatNvidiaGPUs()
    .then(satRenderGPUList)
    .catch(function(err) {
      const root = document.getElementById('sat-gpu-list');
      if (root) {
        root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
      }
      satUpdateGPUSelectionNote();
    });
}
|
||||||
|
// Build a "GPU <index> — <name>" label. The index falls back to 0 when
// missing or non-numeric; the name falls back to "GPU <index>".
function satGPUDisplayName(gpu) {
  let idx = 0;
  if (gpu && Number.isFinite(Number(gpu.index))) {
    idx = Number(gpu.index);
  }
  const label = (gpu && gpu.name) ? gpu.name : ('GPU ' + idx);
  return 'GPU ' + idx + ' — ' + label;
}
|
||||||
|
// Build the JSON body for a /api/sat/<target>/run request: display
// name, stress flag, target-specific defaults (CPU duration), then any
// caller-supplied overrides applied last.
function satRequestBody(target, overrides) {
  const labels = satLabels();
  const body = {
    display_name: labels[target] || ('Validate ' + target),
    stress_mode: satStressMode(),
  };
  if (target === 'cpu') {
    body.duration = satStressMode() ? 1800 : 60;
  }
  if (overrides) {
    for (const key of Object.keys(overrides)) {
      body[key] = overrides[key];
    }
  }
  return body;
}
|
||||||
|
function enqueueSATTarget(target, overrides) {
|
||||||
|
return fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(satRequestBody(target, overrides))})
|
||||||
|
.then(r => r.json());
|
||||||
|
}
|
||||||
|
function streamSATTask(taskId, title, resetTerminal) {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
if (resetTerminal) {
|
||||||
|
term.textContent = '';
|
||||||
|
}
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
return new Promise(function(resolve) {
|
||||||
|
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
satES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
satES.addEventListener('done', function(e) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
satES.onerror = function() {
|
||||||
|
if (satES) {
|
||||||
|
satES.close();
|
||||||
|
satES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function selectedAMDValidateTargets() {
|
||||||
|
const targets = [];
|
||||||
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
|
const mem = document.getElementById('sat-amd-mem-target');
|
||||||
|
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||||
|
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||||
|
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||||
|
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||||
|
return targets;
|
||||||
|
}
|
||||||
|
function runSAT(target) {
|
||||||
|
return runSATWithOverrides(target, null);
|
||||||
|
}
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
|
const title = (overrides && overrides.display_name) || target;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
|
return enqueueSATTarget(target, overrides)
|
||||||
|
.then(d => streamSATTask(d.task_id, title, false));
|
||||||
|
}
|
||||||
|
const nvidiaPerGPUTargets = [];
|
||||||
|
const nvidiaAllGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
function satAllGPUIndicesForMulti() {
|
||||||
|
return Promise.resolve(satSelectedGPUIndices());
|
||||||
|
}
|
||||||
|
function expandSATTarget(target) {
|
||||||
|
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||||
|
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||||
|
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||||
|
return Promise.resolve([{target: target}]);
|
||||||
|
}
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) {
|
||||||
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
|
}
|
||||||
|
return loadSatNvidiaGPUs().then(gpus => gpus.filter(gpu => selected.indexOf(Number(gpu.index)) >= 0).map(gpu => ({
|
||||||
|
target: target,
|
||||||
|
overrides: {
|
||||||
|
gpu_indices: [Number(gpu.index)],
|
||||||
|
display_name: (satLabels()[target] || ('Validate ' + target)) + ' (' + satGPUDisplayName(gpu) + ')'
|
||||||
|
},
|
||||||
|
label: satGPUDisplayName(gpu),
|
||||||
|
})));
|
||||||
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
satAllGPUIndicesForMulti().then(function(indices) {
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function runNvidiaValidateSet(target) {
|
||||||
|
const selected = satSelectedGPUIndices();
|
||||||
|
if (!selected.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
|
return runSATWithOverrides(target, {gpu_indices: selected, display_name: satLabels()[target] || target});
|
||||||
|
}
|
||||||
|
function runAMDValidateSet() {
|
||||||
|
const targets = selectedAMDValidateTargets();
|
||||||
|
if (!targets.length) return;
|
||||||
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
|
document.getElementById('sat-output').style.display='block';
|
||||||
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Running AMD validate set one by one...\n';
|
||||||
|
const labels = satLabels();
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
|
const target = targets[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[target] + '\n';
|
||||||
|
return enqueueSATTarget(target)
|
||||||
|
.then(d => {
|
||||||
|
return streamSATTask(d.task_id, labels[target], false);
|
||||||
|
}).then(function() {
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runAllSAT() {
|
||||||
|
const cycles = 1;
|
||||||
|
const status = document.getElementById('sat-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse'];
|
||||||
|
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||||
|
const activeTargets = baseTargets.filter(target => {
|
||||||
|
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||||
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
|
return !(btn && btn.disabled);
|
||||||
|
});
|
||||||
|
Promise.all(activeTargets.map(expandSATTarget)).then(groups => {
|
||||||
|
const expanded = [];
|
||||||
|
for (let cycle = 0; cycle < cycles; cycle++) {
|
||||||
|
groups.forEach(group => group.forEach(item => expanded.push(item)));
|
||||||
|
}
|
||||||
|
const total = expanded.length;
|
||||||
|
let enqueued = 0;
|
||||||
|
if (!total) {
|
||||||
|
status.textContent = 'No tasks selected.';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const runNext = (idx) => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
|
const item = expanded[idx];
|
||||||
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides)
|
||||||
|
.then(() => {
|
||||||
|
enqueued++;
|
||||||
|
return runNext(idx + 1);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}).catch(err => {
|
||||||
|
status.textContent = 'Error: ' + err.message;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
<script>
|
||||||
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||||
|
});
|
||||||
|
satLoadGPUs();
|
||||||
|
function disableSATAMDOptions(reason) {
|
||||||
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(function(id) {
|
||||||
|
const cb = document.getElementById(id);
|
||||||
|
if (!cb) return;
|
||||||
|
cb.disabled = true;
|
||||||
|
cb.checked = false;
|
||||||
|
cb.title = reason;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function disableSATCard(id, reason) {
|
||||||
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
|
if (!btn) return;
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.title = reason;
|
||||||
|
btn.style.opacity = '0.4';
|
||||||
|
const card = btn.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
let note = card.querySelector('.sat-unavail');
|
||||||
|
if (!note) {
|
||||||
|
note = document.createElement('p');
|
||||||
|
note.className = 'sat-unavail';
|
||||||
|
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
if (body) body.insertBefore(note, body.firstChild);
|
||||||
|
}
|
||||||
|
note.textContent = reason;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadValidateInventory(opts HandlerOptions) validateInventory {
|
||||||
|
unknown := "Audit snapshot not loaded."
|
||||||
|
out := validateInventory{
|
||||||
|
CPU: unknown,
|
||||||
|
Memory: unknown,
|
||||||
|
Storage: unknown,
|
||||||
|
NVIDIA: unknown,
|
||||||
|
AMD: unknown,
|
||||||
|
}
|
||||||
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var snap schema.HardwareIngestRequest
|
||||||
|
if err := json.Unmarshal(data, &snap); err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuCounts := map[string]int{}
|
||||||
|
cpuTotal := 0
|
||||||
|
for _, cpu := range snap.Hardware.CPUs {
|
||||||
|
if cpu.Present != nil && !*cpu.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cpuTotal++
|
||||||
|
addValidateModel(cpuCounts, validateFirstNonEmpty(validateTrimPtr(cpu.Model), validateTrimPtr(cpu.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
memCounts := map[string]int{}
|
||||||
|
memTotal := 0
|
||||||
|
for _, dimm := range snap.Hardware.Memory {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
memTotal++
|
||||||
|
addValidateModel(memCounts, validateFirstNonEmpty(validateTrimPtr(dimm.PartNumber), validateTrimPtr(dimm.Type), validateTrimPtr(dimm.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
storageCounts := map[string]int{}
|
||||||
|
storageTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.Storage {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
storageTotal++
|
||||||
|
addValidateModel(storageCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
|
||||||
|
nvidiaCounts := map[string]int{}
|
||||||
|
nvidiaTotal := 0
|
||||||
|
amdCounts := map[string]int{}
|
||||||
|
amdTotal := 0
|
||||||
|
for _, dev := range snap.Hardware.PCIeDevices {
|
||||||
|
if dev.Present != nil && !*dev.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "nvidia") {
|
||||||
|
nvidiaTotal++
|
||||||
|
addValidateModel(nvidiaCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
if validateIsVendorGPU(dev, "amd") {
|
||||||
|
amdTotal++
|
||||||
|
addValidateModel(amdCounts, validateFirstNonEmpty(validateTrimPtr(dev.Model), validateTrimPtr(dev.Manufacturer), "unknown"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.CPU = formatValidateDeviceSummary(cpuTotal, cpuCounts, "CPU")
|
||||||
|
out.Memory = formatValidateDeviceSummary(memTotal, memCounts, "module")
|
||||||
|
out.Storage = formatValidateDeviceSummary(storageTotal, storageCounts, "device")
|
||||||
|
out.NVIDIA = formatValidateDeviceSummary(nvidiaTotal, nvidiaCounts, "GPU")
|
||||||
|
out.AMD = formatValidateDeviceSummary(amdTotal, amdCounts, "GPU")
|
||||||
|
out.NvidiaGPUCount = nvidiaTotal
|
||||||
|
out.AMDGPUCount = amdTotal
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidateCardBody(devices, description, commands, settings string) string {
|
||||||
|
return `<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + devices + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px">` + description + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px">` + commands + `</div></div>` +
|
||||||
|
`<div class="validate-card-section"><div style="font-size:13px;color:var(--muted)">` + settings + `</div></div>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatValidateDeviceSummary(total int, models map[string]int, unit string) string {
|
||||||
|
if total == 0 {
|
||||||
|
return "0 " + unit + "s detected."
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(models))
|
||||||
|
for key := range models {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
parts := make([]string, 0, len(keys))
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", models[key], html.EscapeString(key)))
|
||||||
|
}
|
||||||
|
label := unit
|
||||||
|
if total != 1 {
|
||||||
|
label += "s"
|
||||||
|
}
|
||||||
|
if len(parts) == 1 {
|
||||||
|
return parts[0] + " " + label
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
|
||||||
|
}
|
||||||
|
|
||||||
|
func addValidateModel(counts map[string]int, name string) {
|
||||||
|
name = strings.TrimSpace(name)
|
||||||
|
if name == "" {
|
||||||
|
name = "unknown"
|
||||||
|
}
|
||||||
|
counts[name]++
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateTrimPtr(value *string) string {
|
||||||
|
if value == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(*value)
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateFirstNonEmpty(values ...string) string {
|
||||||
|
for _, value := range values {
|
||||||
|
value = strings.TrimSpace(value)
|
||||||
|
if value != "" {
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
|
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||||
|
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
switch vendor {
|
||||||
|
case "nvidia":
|
||||||
|
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||||
|
case "amd":
|
||||||
|
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||||
|
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||||
|
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||||
|
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||||
|
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||||
|
if strings.TrimSpace(headerActions) != "" {
|
||||||
|
actions += headerActions
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`<div class="card"><div class="card-head card-head-actions"><span>%s</span><div class="card-head-buttons">%s</div></div><div class="card-body validate-card-body">%s</div></div>`,
|
||||||
|
label, actions, body)
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var taskSerialWriteLine = writeTaskSerialLine
|
||||||
|
|
||||||
|
func writeTaskSerialLine(line string) {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
|
||||||
|
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
|
||||||
|
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
_, _ = f.WriteString(payload)
|
||||||
|
_ = f.Close()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialPrefix(t *Task) string {
|
||||||
|
if t == nil {
|
||||||
|
return "[task] "
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialEvent(t *Task, event string) {
|
||||||
|
if t == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||||
|
}
|
||||||
@@ -1,15 +1,19 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"mime"
|
"mime"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -18,7 +22,6 @@ import (
|
|||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
gocharts "github.com/go-analyze/charts"
|
|
||||||
"reanimator/chart/viewer"
|
"reanimator/chart/viewer"
|
||||||
"reanimator/chart/web"
|
"reanimator/chart/web"
|
||||||
)
|
)
|
||||||
@@ -132,6 +135,14 @@ type namedMetricsRing struct {
|
|||||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||||
const metricsChartWindow = 360
|
const metricsChartWindow = 360
|
||||||
|
|
||||||
|
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||||
|
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||||
|
const metricsDownsampleAge = 2 * time.Hour
|
||||||
|
|
||||||
|
// metricsRetainWindow is the total retention period for metrics rows.
|
||||||
|
// Rows older than this are deleted entirely by the background compactor.
|
||||||
|
const metricsRetainWindow = 48 * time.Hour
|
||||||
|
|
||||||
var metricsCollectInterval = 5 * time.Second
|
var metricsCollectInterval = 5 * time.Second
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
@@ -218,6 +229,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||||
mux.HandleFunc("GET /api/ready", h.handleReady)
|
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||||
|
mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(loadingPageHTML))
|
||||||
|
})
|
||||||
|
|
||||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||||
@@ -234,6 +250,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// SAT
|
// SAT
|
||||||
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
|
||||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
@@ -247,6 +269,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
|
||||||
|
mux.HandleFunc("POST /api/bee-bench/nvidia/autotune/run", h.handleAPIBenchmarkAutotuneRun())
|
||||||
|
mux.HandleFunc("GET /api/bee-bench/nvidia/autotune/status", h.handleAPIBenchmarkAutotuneStatus)
|
||||||
|
mux.HandleFunc("GET /api/benchmark/results", h.handleAPIBenchmarkResults)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
@@ -255,6 +282,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
|
||||||
|
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
||||||
|
|
||||||
// Services
|
// Services
|
||||||
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
||||||
@@ -271,18 +301,18 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Export
|
// Export
|
||||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||||
|
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|
||||||
// Display
|
|
||||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
|
||||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia-status", h.handleAPIGNVIDIAGPUStatuses)
|
||||||
|
mux.HandleFunc("POST /api/gpu/nvidia-reset", h.handleAPIGNVIDIAReset)
|
||||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
@@ -309,22 +339,33 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /", h.handlePage)
|
mux.HandleFunc("GET /", h.handlePage)
|
||||||
|
|
||||||
h.mux = mux
|
h.mux = mux
|
||||||
return mux
|
return recoverMiddleware(mux)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) startMetricsCollector() {
|
func (h *handler) startMetricsCollector() {
|
||||||
go func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
pruneTicker := time.NewTicker(time.Hour)
|
||||||
sample := platform.SampleLiveMetrics()
|
defer pruneTicker.Stop()
|
||||||
if h.metricsDB != nil {
|
for {
|
||||||
_ = h.metricsDB.Write(sample)
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
case <-pruneTicker.C:
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||||
|
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
}
|
}
|
||||||
}()
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
@@ -345,7 +386,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
|||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
srv := &http.Server{
|
||||||
|
Addr: addr,
|
||||||
|
Handler: NewHandler(opts),
|
||||||
|
ReadHeaderTimeout: 5 * time.Second,
|
||||||
|
ReadTimeout: 30 * time.Second,
|
||||||
|
IdleTimeout: 2 * time.Minute,
|
||||||
|
}
|
||||||
|
return srv.ListenAndServe()
|
||||||
|
}
|
||||||
|
|
||||||
|
type trackingResponseWriter struct {
|
||||||
|
http.ResponseWriter
|
||||||
|
wroteHeader bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
w.ResponseWriter.WriteHeader(statusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
|
||||||
|
w.wroteHeader = true
|
||||||
|
return w.ResponseWriter.Write(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Flush() {
|
||||||
|
w.wroteHeader = true
|
||||||
|
if f, ok := w.ResponseWriter.(http.Flusher); ok {
|
||||||
|
f.Flush()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
|
||||||
|
h, ok := w.ResponseWriter.(http.Hijacker)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil, fmt.Errorf("hijacking not supported")
|
||||||
|
}
|
||||||
|
return h.Hijack()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
|
||||||
|
p, ok := w.ResponseWriter.(http.Pusher)
|
||||||
|
if !ok {
|
||||||
|
return http.ErrNotSupported
|
||||||
|
}
|
||||||
|
return p.Push(target, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
|
||||||
|
rf, ok := w.ResponseWriter.(io.ReaderFrom)
|
||||||
|
if !ok {
|
||||||
|
return io.Copy(w.ResponseWriter, r)
|
||||||
|
}
|
||||||
|
w.wroteHeader = true
|
||||||
|
return rf.ReadFrom(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
func recoverMiddleware(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
tw := &trackingResponseWriter{ResponseWriter: w}
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("http handler panic",
|
||||||
|
"method", r.Method,
|
||||||
|
"path", r.URL.Path,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
if !tw.wroteHeader {
|
||||||
|
http.Error(tw, "internal server error", http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
next.ServeHTTP(tw, r)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
||||||
@@ -475,13 +590,60 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
samples, err := h.metricsDB.LoadAll()
|
||||||
|
if err != nil || len(samples) == 0 {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
|
var overviewOk bool
|
||||||
|
var buf []byte
|
||||||
|
buf, overviewOk, err = renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !overviewOk {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
var buf []byte
|
||||||
|
if stacked {
|
||||||
|
buf, err = renderStackedMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
buf, err = renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -491,20 +653,8 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) (datasets [][]float64, names []string, labels []string, title string, yMin, yMax *float64, stacked bool, ok bool) {
|
||||||
samples, err := h.metricsDB.LoadAll()
|
labels = sampleTimeLabels(samples)
|
||||||
if err != nil || len(samples) == 0 {
|
|
||||||
return nil, nil, nil, "", nil, nil, false
|
|
||||||
}
|
|
||||||
return chartDataFromSamples(path, samples)
|
|
||||||
}
|
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
|
||||||
var datasets [][]float64
|
|
||||||
var names []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64
|
|
||||||
labels := sampleTimeLabels(samples)
|
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case path == "server-load":
|
case path == "server-load":
|
||||||
@@ -541,12 +691,19 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
case path == "server-power":
|
case path == "server-power":
|
||||||
title = "System Power"
|
title = "System Power"
|
||||||
power := make([]float64, len(samples))
|
power := make([]float64, len(samples))
|
||||||
|
label := "Power W"
|
||||||
for i, s := range samples {
|
for i, s := range samples {
|
||||||
power[i] = s.PowerW
|
power[i] = s.PowerW
|
||||||
|
if strings.TrimSpace(s.PowerSource) != "" {
|
||||||
|
label = fmt.Sprintf("Power W · %s", s.PowerSource)
|
||||||
|
if strings.TrimSpace(s.PowerMode) != "" {
|
||||||
|
label += fmt.Sprintf(" (%s)", s.PowerMode)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
power = normalizePowerSeries(power)
|
power = normalizePowerSeries(power)
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{label}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(power)
|
yMax = autoMax120(power)
|
||||||
|
|
||||||
@@ -578,42 +735,66 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(datasets...)
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-clock":
|
||||||
|
title = "GPU Core Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-memclock":
|
||||||
|
title = "GPU Memory Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
sub := ""
|
if !ok {
|
||||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
sub = rest[i+1:]
|
|
||||||
rest = rest[:i]
|
|
||||||
}
|
}
|
||||||
idx := 0
|
|
||||||
fmt.Sscanf(rest, "%d", &idx)
|
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
title = gpuDisplayLabel(idx) + " Load"
|
||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
names = []string{"Load %", "Mem %"}
|
names = []string{"Load %", "Mem %"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = floatPtr(100)
|
yMax = floatPtr(100)
|
||||||
case "temp":
|
case "temp":
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{temp}
|
datasets = [][]float64{temp}
|
||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(temp)
|
yMax = autoMax120(temp)
|
||||||
|
case "clock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Core Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
|
case "memclock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Memory Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
default:
|
default:
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
datasets = [][]float64{power}
|
datasets = [][]float64{power}
|
||||||
names = []string{"Power W"}
|
names = []string{"Power W"}
|
||||||
@@ -621,10 +802,30 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false, false
|
||||||
}
|
}
|
||||||
|
|
||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, stacked, len(datasets) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
|
||||||
|
if !strings.HasPrefix(path, "gpu/") {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
rest := strings.TrimPrefix(path, "gpu/")
|
||||||
|
if rest == "" {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
sub = ""
|
||||||
|
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||||
|
sub = rest[i+1:]
|
||||||
|
rest = rest[:i]
|
||||||
|
}
|
||||||
|
n, err := fmt.Sscanf(rest, "%d", &idx)
|
||||||
|
if err != nil || n != 1 {
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
return idx, sub, true
|
||||||
}
|
}
|
||||||
|
|
||||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||||
@@ -719,7 +920,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, ds)
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
names = append(names, gpuDisplayLabel(idx))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
@@ -770,6 +971,37 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// psuSlotsFromSamples returns the sorted list of PSU slot numbers seen across samples.
|
||||||
|
func psuSlotsFromSamples(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]struct{}{}
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, p := range s.PSUs {
|
||||||
|
seen[p.Slot] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slots := make([]int, 0, len(seen))
|
||||||
|
for s := range seen {
|
||||||
|
slots = append(slots, s)
|
||||||
|
}
|
||||||
|
sort.Ints(slots)
|
||||||
|
return slots
|
||||||
|
}
|
||||||
|
|
||||||
|
// psuStackedTotal returns the point-by-point sum of all PSU datasets (for scale calculation).
|
||||||
|
func psuStackedTotal(datasets [][]float64) []float64 {
|
||||||
|
if len(datasets) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
n := len(datasets[0])
|
||||||
|
total := make([]float64, n)
|
||||||
|
for _, ds := range datasets {
|
||||||
|
for i, v := range ds {
|
||||||
|
total[i] += v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total
|
||||||
|
}
|
||||||
|
|
||||||
func normalizeFanSeries(ds []float64) []float64 {
|
func normalizeFanSeries(ds []float64) []float64 {
|
||||||
if len(ds) == 0 {
|
if len(ds) == 0 {
|
||||||
return nil
|
return nil
|
||||||
@@ -852,64 +1084,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
|||||||
return floatPtr(low), floatPtr(high)
|
return floatPtr(low), floatPtr(high)
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
func gpuChartLabelIndices(total, target int) []int {
|
||||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
if total <= 0 {
|
||||||
n := len(labels)
|
return nil
|
||||||
if n == 0 {
|
|
||||||
n = 1
|
|
||||||
labels = []string{""}
|
|
||||||
}
|
}
|
||||||
for i := range datasets {
|
if total == 1 {
|
||||||
if len(datasets[i]) == 0 {
|
return []int{0}
|
||||||
datasets[i] = make([]float64, n)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Append global min/avg/max to title.
|
step := total / target
|
||||||
mn, avg, mx := globalStats(datasets)
|
if step < 1 {
|
||||||
if mx > 0 {
|
step = 1
|
||||||
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
|
|
||||||
title,
|
|
||||||
chartLegendNumber(mn),
|
|
||||||
chartLegendNumber(avg),
|
|
||||||
chartLegendNumber(mx),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
title = sanitizeChartText(title)
|
var indices []int
|
||||||
names = sanitizeChartTexts(names)
|
for i := 0; i < total; i += step {
|
||||||
sparse := sanitizeChartTexts(sparseLabels(labels, 6))
|
indices = append(indices, i)
|
||||||
|
}
|
||||||
|
if indices[len(indices)-1] != total-1 {
|
||||||
|
indices = append(indices, total-1)
|
||||||
|
}
|
||||||
|
return indices
|
||||||
|
}
|
||||||
|
|
||||||
opt := gocharts.NewLineChartOptionWithData(datasets)
|
func chartCanvasHeightForPath(path string, seriesCount int) int {
|
||||||
opt.Title = gocharts.TitleOption{Text: title}
|
height := chartCanvasHeight(seriesCount)
|
||||||
opt.XAxis.Labels = sparse
|
if isGPUChartPath(path) {
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
return height * 2
|
||||||
if chartLegendVisible(len(names)) {
|
|
||||||
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
|
||||||
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
|
||||||
} else {
|
|
||||||
opt.Legend.Show = gocharts.Ptr(false)
|
|
||||||
}
|
|
||||||
opt.Symbol = gocharts.SymbolNone
|
|
||||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
|
||||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
|
||||||
if yMin != nil || yMax != nil {
|
|
||||||
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
|
||||||
}
|
}
|
||||||
|
return height
|
||||||
|
}
|
||||||
|
|
||||||
// Add a single peak mark line on the series that holds the global maximum.
|
func isGPUChartPath(path string) bool {
|
||||||
peakIdx, _ := globalPeakSeries(datasets)
|
return strings.HasPrefix(path, "gpu-all-") || strings.HasPrefix(path, "gpu/")
|
||||||
if peakIdx >= 0 && peakIdx < len(opt.SeriesList) {
|
|
||||||
opt.SeriesList[peakIdx].MarkLine = gocharts.NewMarkLine(gocharts.SeriesMarkTypeMax)
|
|
||||||
}
|
|
||||||
|
|
||||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
|
||||||
OutputFormat: gocharts.ChartOutputSVG,
|
|
||||||
Width: 1400,
|
|
||||||
Height: chartCanvasHeight(len(names)),
|
|
||||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
|
||||||
if err := p.LineChart(opt); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return p.Bytes()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartLegendVisible(seriesCount int) bool {
|
func chartLegendVisible(seriesCount int) bool {
|
||||||
@@ -923,30 +1128,6 @@ func chartCanvasHeight(seriesCount int) int {
|
|||||||
return 288
|
return 288
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
|
||||||
return gocharts.YAxisOption{
|
|
||||||
Min: yMin,
|
|
||||||
Max: yMax,
|
|
||||||
LabelCount: 11,
|
|
||||||
ValueFormatter: chartYAxisNumber,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// globalPeakSeries returns the index of the series containing the global maximum
|
|
||||||
// value across all datasets, and that maximum value.
|
|
||||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
|
||||||
idx = -1
|
|
||||||
for i, ds := range datasets {
|
|
||||||
for _, v := range ds {
|
|
||||||
if v > peak {
|
|
||||||
peak = v
|
|
||||||
idx = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return idx, peak
|
|
||||||
}
|
|
||||||
|
|
||||||
// globalStats returns min, average, and max across all values in all datasets.
|
// globalStats returns min, average, and max across all values in all datasets.
|
||||||
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
||||||
var sum float64
|
var sum float64
|
||||||
@@ -986,21 +1167,6 @@ func sanitizeChartText(s string) string {
|
|||||||
}, s))
|
}, s))
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeChartTexts(in []string) []string {
|
|
||||||
out := make([]string, len(in))
|
|
||||||
for i, s := range in {
|
|
||||||
out[i] = sanitizeChartText(s)
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func safeIdx(s []float64, i int) float64 {
|
|
||||||
if i < len(s) {
|
|
||||||
return s[i]
|
|
||||||
}
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
@@ -1087,20 +1253,6 @@ func chartYAxisNumber(v float64) string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
func sparseLabels(labels []string, n int) []string {
|
|
||||||
out := make([]string, len(labels))
|
|
||||||
step := len(labels) / n
|
|
||||||
if step < 1 {
|
|
||||||
step = 1
|
|
||||||
}
|
|
||||||
for i, l := range labels {
|
|
||||||
if i%step == 0 {
|
|
||||||
out[i] = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.metricsDB == nil {
|
if h.metricsDB == nil {
|
||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
@@ -1116,6 +1268,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
|
|||||||
|
|
||||||
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
if strings.TrimSpace(h.opts.AuditPath) == "" {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ready"))
|
||||||
|
return
|
||||||
|
}
|
||||||
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||||
w.WriteHeader(http.StatusServiceUnavailable)
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
_, _ = w.Write([]byte("starting"))
|
_, _ = w.Write([]byte("starting"))
|
||||||
@@ -1129,37 +1286,106 @@ const loadingPageHTML = `<!DOCTYPE html>
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>EASY-BEE</title>
|
<title>EASY-BEE — Starting</title>
|
||||||
<style>
|
<style>
|
||||||
*{margin:0;padding:0;box-sizing:border-box}
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
.wrap{text-align:center;width:420px}
|
||||||
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||||
|
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||||
|
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||||
|
.spinner.hidden{display:none}
|
||||||
@keyframes spin{to{transform:rotate(360deg)}}
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
|
||||||
|
td{padding:3px 6px;text-align:left}
|
||||||
|
td:first-child{color:#718096;width:55%}
|
||||||
|
.ok{color:#68d391}
|
||||||
|
.run{color:#f6c90e}
|
||||||
|
.fail{color:#fc8181}
|
||||||
|
.dim{color:#4a5568}
|
||||||
|
.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
|
||||||
|
.btn:hover{border-color:#718096;color:#e2e8f0}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div style="text-align:center">
|
<div class="wrap">
|
||||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||||
<div class="spinner"></div>
|
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||||
<div class="status" id="s">Starting up...</div>
|
<div class="spinner" id="spin"></div>
|
||||||
|
<div class="status" id="st">Connecting to bee-web...</div>
|
||||||
|
<table id="tbl"></table>
|
||||||
|
<button class="btn" id="btn" onclick="go()">Open app now</button>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function probe(){
|
(function(){
|
||||||
fetch('/api/ready',{cache:'no-store'})
|
var gone = false;
|
||||||
.then(function(r){
|
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||||
if(r.ok){window.location.replace('/');}
|
|
||||||
else{setTimeout(probe,1000);}
|
function icon(s){
|
||||||
|
if(s==='active') return '<span class="ok">● active</span>';
|
||||||
|
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||||
|
if(s==='activating'||s==='reloading') return '<span class="run">○ starting</span>';
|
||||||
|
if(s==='inactive') return '<span class="dim">○ inactive</span>';
|
||||||
|
return '<span class="dim">'+s+'</span>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function allSettled(svcs){
|
||||||
|
for(var i=0;i<svcs.length;i++){
|
||||||
|
var s=svcs[i].state;
|
||||||
|
if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var pollTimer=null;
|
||||||
|
|
||||||
|
function pollServices(){
|
||||||
|
fetch('/api/services',{cache:'no-store'})
|
||||||
|
.then(function(r){return r.json();})
|
||||||
|
.then(function(svcs){
|
||||||
|
if(!svcs||!svcs.length) return;
|
||||||
|
var tbl=document.getElementById('tbl');
|
||||||
|
tbl.style.display='';
|
||||||
|
var html='';
|
||||||
|
for(var i=0;i<svcs.length;i++)
|
||||||
|
html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
|
||||||
|
tbl.innerHTML=html;
|
||||||
|
if(allSettled(svcs)){
|
||||||
|
clearInterval(pollTimer);
|
||||||
|
document.getElementById('spin').className='spinner hidden';
|
||||||
|
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||||
|
setTimeout(go,800);
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch(function(){setTimeout(probe,1000);});
|
.catch(function(){});
|
||||||
|
}
|
||||||
|
|
||||||
|
function probe(){
|
||||||
|
fetch('/healthz',{cache:'no-store'})
|
||||||
|
.then(function(r){
|
||||||
|
if(r.ok){
|
||||||
|
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||||
|
document.getElementById('btn').style.display='';
|
||||||
|
pollServices();
|
||||||
|
pollTimer=setInterval(pollServices,1500);
|
||||||
|
} else {
|
||||||
|
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(function(){
|
||||||
|
document.getElementById('st').textContent='Waiting for bee-web to start...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
probe();
|
probe();
|
||||||
|
})();
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -10,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestChartLegendNumber(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
@@ -34,6 +36,59 @@ func TestChartLegendNumber(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
panic("boom")
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||||
|
t.Fatalf("body=%q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if !sseStart(w) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !sseWrite(w, "tick", "ok") {
|
||||||
|
t.Fatal("expected sse write to succeed")
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
|
||||||
|
t.Fatalf("content-type=%q", got)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
|
||||||
|
t.Fatalf("body=%q", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildRuntimeToRAMRowShowsPartialCopyWarning(t *testing.T) {
|
||||||
|
row := buildRuntimeToRAMRow(schema.RuntimeHealth{ToRAMStatus: "partial"})
|
||||||
|
if row.Status != "WARNING" {
|
||||||
|
t.Fatalf("status=%q want WARNING", row.Status)
|
||||||
|
}
|
||||||
|
if !strings.Contains(row.Issue, "Partial or staged RAM copy detected") {
|
||||||
|
t.Fatalf("issue=%q", row.Issue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
@@ -65,7 +120,7 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, labels, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -109,7 +164,7 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatal("chartDataFromSamples returned ok=false")
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
}
|
}
|
||||||
@@ -136,6 +191,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1400},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1500},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1510},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Core Clock" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if got := datasets[1][1]; got != 1510 {
|
||||||
|
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
want := []float64{0, 480, 480, 480, 510, 510}
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
@@ -157,6 +245,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
|||||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
}
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
|
||||||
|
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
|
||||||
|
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||||
|
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||||
|
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||||
|
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartLegendVisible(t *testing.T) {
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
@@ -199,6 +302,167 @@ func TestChartCanvasHeight(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
end := start.Add(10 * time.Minute)
|
||||||
|
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
|
||||||
|
s := start.Add(offsetStart)
|
||||||
|
e := start.Add(offsetEnd)
|
||||||
|
return Task{
|
||||||
|
Name: "task",
|
||||||
|
Status: TaskDone,
|
||||||
|
StartedAt: &s,
|
||||||
|
DoneAt: &e,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
|
||||||
|
taskWindow(1*time.Minute, 3*time.Minute),
|
||||||
|
taskWindow(2*time.Minute, 5*time.Minute),
|
||||||
|
taskWindow(7*time.Minute, 8*time.Minute),
|
||||||
|
})
|
||||||
|
if len(segments) != 5 {
|
||||||
|
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
|
||||||
|
}
|
||||||
|
wantActive := []bool{false, true, false, true, false}
|
||||||
|
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
|
||||||
|
for i, segment := range segments {
|
||||||
|
if segment.Active != wantActive[i] {
|
||||||
|
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
|
||||||
|
}
|
||||||
|
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
|
||||||
|
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
|
||||||
|
}
|
||||||
|
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
|
||||||
|
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
labels := []string{"12:00", "12:01", "12:02"}
|
||||||
|
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
|
||||||
|
svg, err := renderMetricChartSVG(
|
||||||
|
"System Power",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[][]float64{{300, 320, 310}},
|
||||||
|
[]string{"Power W"},
|
||||||
|
floatPtr(0),
|
||||||
|
floatPtr(400),
|
||||||
|
360,
|
||||||
|
[]chartTimelineSegment{
|
||||||
|
{Start: start, End: start.Add(time.Minute), Active: false},
|
||||||
|
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
body := string(svg)
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("svg missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `opacity="0.10"`) {
|
||||||
|
t.Fatalf("svg missing idle overlay opacity: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `System Power`) {
|
||||||
|
t.Fatalf("svg missing chart title: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = db.db.Close() })
|
||||||
|
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
for i, sample := range []platform.LiveMetricSample{
|
||||||
|
{Timestamp: start, PowerW: 300},
|
||||||
|
{Timestamp: start.Add(time.Minute), PowerW: 320},
|
||||||
|
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
|
||||||
|
} {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("write sample %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
prevTasks := globalQueue.tasks
|
||||||
|
s := start.Add(30 * time.Second)
|
||||||
|
e := start.Add(90 * time.Second)
|
||||||
|
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = prevTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
|
||||||
|
h.handleMetricsChartSVG(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("custom svg response missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `stroke-linecap="round"`) {
|
||||||
|
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesServerPowerUsesResolvedSystemPower(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: start,
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 120},
|
||||||
|
{Slot: 2, PowerW: 130},
|
||||||
|
},
|
||||||
|
PowerW: 250,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: start.Add(time.Minute),
|
||||||
|
PSUs: []platform.PSUReading{
|
||||||
|
{Slot: 1, PowerW: 140},
|
||||||
|
{Slot: 2, PowerW: 135},
|
||||||
|
},
|
||||||
|
PowerW: 275,
|
||||||
|
PowerSource: "sdr_psu_input",
|
||||||
|
PowerMode: "autotuned",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, stacked, ok := chartDataFromSamples("server-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected server-power chart data")
|
||||||
|
}
|
||||||
|
if title != "System Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if stacked {
|
||||||
|
t.Fatal("server-power should use resolved system power, not stacked PSU inputs")
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(names) != 1 {
|
||||||
|
t.Fatalf("datasets=%d names=%d want 1/1", len(datasets), len(names))
|
||||||
|
}
|
||||||
|
if names[0] != "Power W · sdr_psu_input (autotuned)" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
@@ -212,21 +476,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartYAxisOption(t *testing.T) {
|
|
||||||
min := floatPtr(0)
|
|
||||||
max := floatPtr(100)
|
|
||||||
opt := chartYAxisOption(min, max)
|
|
||||||
if opt.Min != min || opt.Max != max {
|
|
||||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
|
||||||
}
|
|
||||||
if opt.LabelCount != 11 {
|
|
||||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
|
||||||
}
|
|
||||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
|
||||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
r1 := newMetricsRing(4)
|
r1 := newMetricsRing(4)
|
||||||
r2 := newMetricsRing(4)
|
r2 := newMetricsRing(4)
|
||||||
@@ -335,7 +584,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `Run Audit`) {
|
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
}
|
}
|
||||||
if strings.Contains(body, `No audit data`) {
|
if strings.Contains(body, `No audit data`) {
|
||||||
@@ -343,6 +592,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(rec.Body.String()) != "ready" {
|
||||||
|
t.Fatalf("body=%q want ready", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -365,7 +626,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||||
@@ -373,8 +634,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||||
@@ -384,7 +645,7 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
@@ -392,54 +653,332 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||||
|
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||||
|
}
|
||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||||
|
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||||
|
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Export to USB`) {
|
if !strings.Contains(body, `USB Black-Box`) {
|
||||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||||
dir := t.TempDir()
|
handler := NewHandler(HandlerOptions{})
|
||||||
path := filepath.Join(dir, "audit.json")
|
|
||||||
exportDir := filepath.Join(dir, "export")
|
|
||||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{
|
|
||||||
Title: "Bee Hardware Audit",
|
|
||||||
AuditPath: path,
|
|
||||||
ExportDir: exportDir,
|
|
||||||
})
|
|
||||||
|
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
for _, needle := range []string{
|
||||||
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
`href="/benchmark"`,
|
||||||
|
`id="benchmark-gpu-list"`,
|
||||||
|
`/api/gpu/nvidia`,
|
||||||
|
`/api/bee-bench/nvidia/perf/run`,
|
||||||
|
`/api/bee-bench/nvidia/power/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/run`,
|
||||||
|
`/api/bee-bench/nvidia/autotune/status`,
|
||||||
|
`benchmark-run-nccl`,
|
||||||
|
`Run Performance Benchmark`,
|
||||||
|
`Run Power / Thermal Fit`,
|
||||||
|
`Autotune`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
}
|
||||||
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
|
||||||
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
result := platform.NvidiaBenchmarkResult{
|
||||||
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Index: 1,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1168.50,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Perf Results`,
|
||||||
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
|
`GPU 0`,
|
||||||
|
`GPU 1`,
|
||||||
|
`#1`,
|
||||||
|
wantTime,
|
||||||
|
`1176.25`,
|
||||||
|
`1168.50`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA GPU Targeted Stress`,
|
||||||
|
`nvidia-targeted-stress`,
|
||||||
|
`controlled NVIDIA DCGM load`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
`NVIDIA GPU Selection`,
|
||||||
|
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||||
|
`Select All`,
|
||||||
|
`id="sat-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
|
`Validate and Stress:`,
|
||||||
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
|
`nvbandwidth runs all built-in tests without a time limit`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Max Compute Load`,
|
||||||
|
`dcgmproftester`,
|
||||||
|
`NCCL`,
|
||||||
|
`Validate → Stress mode`,
|
||||||
|
`id="burn-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||||
|
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
reportPath := filepath.Join(reportDir, "report.html")
|
||||||
|
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
ArtifactsDir: reportDir,
|
||||||
|
ReportHTMLPath: reportPath,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `saved report`) {
|
||||||
|
t.Fatalf("task detail page missing saved report: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Back to Tasks`) {
|
||||||
|
t.Fatalf("task detail page missing back link: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-live-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Cancel</button>`) {
|
||||||
|
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||||
|
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||||
|
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||||
|
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||||
|
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC()
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||||
|
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||||
|
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||||
|
}
|
||||||
|
for _, sample := range samples {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||||
|
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-chart-1",
|
||||||
|
Name: "Power Window",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
DoneAt: &done,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||||
|
req.SetPathValue("id", "task-chart-1")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "System Power") {
|
||||||
|
t.Fatalf("task chart missing expected title: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "min 200") {
|
||||||
|
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, "min 100") {
|
||||||
|
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -564,3 +1103,101 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
|||||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
health := `{
|
||||||
|
"status":"PARTIAL",
|
||||||
|
"checked_at":"2026-03-16T10:00:00Z",
|
||||||
|
"export_dir":"/tmp/export",
|
||||||
|
"driver_ready":true,
|
||||||
|
"cuda_ready":false,
|
||||||
|
"network_status":"PARTIAL",
|
||||||
|
"issues":[
|
||||||
|
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
|
||||||
|
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
|
||||||
|
],
|
||||||
|
"tools":[
|
||||||
|
{"name":"dmidecode","ok":true},
|
||||||
|
{"name":"nvidia-smi","ok":false}
|
||||||
|
],
|
||||||
|
"services":[
|
||||||
|
{"name":"bee-web","status":"active"},
|
||||||
|
{"name":"bee-nvidia","status":"inactive"}
|
||||||
|
]
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
componentStatus := `[
|
||||||
|
{
|
||||||
|
"component_key":"cpu:all",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"cpu SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"memory:all",
|
||||||
|
"status":"OK",
|
||||||
|
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"storage:nvme0n1",
|
||||||
|
"status":"Critical",
|
||||||
|
"error_summary":"storage SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"pcie:gpu:nvidia",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"nvidia SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
|
||||||
|
}
|
||||||
|
]`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
// Runtime Health card — LiveCD checks only
|
||||||
|
`Runtime Health`,
|
||||||
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
|
`Export Directory`,
|
||||||
|
`Network`,
|
||||||
|
`NVIDIA/AMD Driver`,
|
||||||
|
`CUDA / ROCm`,
|
||||||
|
`Required Utilities`,
|
||||||
|
`Bee Services`,
|
||||||
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
|
`Missing: nvidia-smi`,
|
||||||
|
`bee-nvidia=inactive`,
|
||||||
|
// Hardware Summary card — component health badges
|
||||||
|
`Hardware Summary`,
|
||||||
|
`>CPU<`,
|
||||||
|
`>Memory<`,
|
||||||
|
`>Storage<`,
|
||||||
|
`>GPU<`,
|
||||||
|
`>PSU<`,
|
||||||
|
`badge-warn`, // cpu Warning badge
|
||||||
|
`badge-err`, // storage Critical badge
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
71
audit/internal/webui/stability.go
Normal file
71
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Backoff bounds for goRecoverLoop: the restart delay is capped at
// recoverLoopMaxDelay, and resets to its initial value once a run
// survives for at least recoverLoopResetAfter without panicking.
const (
	recoverLoopMaxDelay   = 60 * time.Second
	recoverLoopResetAfter = 30 * time.Second
)
|
||||||
|
|
||||||
|
// goRecoverLoop starts fn in a goroutine, restarting after panics.
|
||||||
|
// restartDelay is the initial delay; successive panics double it up to
|
||||||
|
// recoverLoopMaxDelay. The delay resets to restartDelay once fn runs
|
||||||
|
// successfully for recoverLoopResetAfter without panicking.
|
||||||
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
|
go func() {
|
||||||
|
delay := restartDelay
|
||||||
|
consecutive := 0
|
||||||
|
for {
|
||||||
|
start := time.Now()
|
||||||
|
panicked := runRecoverable(name, fn)
|
||||||
|
if !panicked {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
consecutive++
|
||||||
|
if time.Since(start) >= recoverLoopResetAfter {
|
||||||
|
delay = restartDelay
|
||||||
|
consecutive = 1
|
||||||
|
}
|
||||||
|
slog.Warn("goroutine restarting after panic",
|
||||||
|
"component", name,
|
||||||
|
"consecutive_panics", consecutive,
|
||||||
|
"next_delay", delay,
|
||||||
|
)
|
||||||
|
if delay > 0 {
|
||||||
|
time.Sleep(delay)
|
||||||
|
}
|
||||||
|
if delay < recoverLoopMaxDelay {
|
||||||
|
delay *= 2
|
||||||
|
if delay > recoverLoopMaxDelay {
|
||||||
|
delay = recoverLoopMaxDelay
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func goRecoverOnce(name string, fn func()) {
|
||||||
|
go func() {
|
||||||
|
_ = runRecoverable(name, fn)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// runRecoverable invokes fn, converting a panic into a structured error
// log (with the stack trace). It reports whether fn panicked.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		panicked = true
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(r),
			"stack", string(debug.Stack()),
		)
	}()
	fn()
	return false
}
|
||||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
	"encoding/json"
	"fmt"
	"html"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"bee/audit/internal/platform"
)
|
||||||
|
|
||||||
|
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
task, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
snapshot := *task
|
||||||
|
body := renderTaskDetailPage(h.opts, snapshot)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
type taskChartIndexEntry struct {
|
||||||
|
Title string `json:"title"`
|
||||||
|
File string `json:"file"`
|
||||||
|
}
|
||||||
|
entries := make([]taskChartIndexEntry, 0)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_ = json.NewEncoder(w).Encode(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||||
|
path, ok := taskChartPathFromFile(file)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||||
|
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskDetailPage builds the complete HTML document for a task's
// detail page: a toolbar, the saved report fragment (or a live summary
// card), and — for running/pending tasks — live chart and log sections
// driven by the embedded client-side script (SSE log stream plus 2s
// chart polling).
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
	// Fall back to the task ID when no human-readable name is set.
	title := task.Name
	if strings.TrimSpace(title) == "" {
		title = task.ID
	}
	var body strings.Builder
	// Toolbar: back link, optional Cancel button, artifacts hint.
	body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
	body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
	if task.Status == TaskRunning || task.Status == TaskPending {
		body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
	}
	body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
	body.WriteString(`</div>`)

	// Prefer the persisted report fragment; otherwise render a minimal
	// inline summary card from the live task fields.
	if report := loadTaskReportFragment(task); report != "" {
		body.WriteString(report)
	} else {
		body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
		body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
		body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
		if strings.TrimSpace(task.ErrMsg) != "" {
			body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
		}
		body.WriteString(`</div></div>`)
	}

	// Live charts are only shown while the task is actually running.
	if task.Status == TaskRunning {
		body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
		body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
		body.WriteString(`</div></div>`)
	}

	// Live log terminal plus the page script: cancel handler, chart
	// index fetch/refresh, and the EventSource log stream that freezes
	// charts and closes itself once the task reports 'done'.
	if task.Status == TaskRunning || task.Status == TaskPending {
		body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
		body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
		body.WriteString(`</div></div>`)
		body.WriteString(`<script>
function cancelTaskDetail(id) {
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
var term = document.getElementById('task-live-log');
if (term) {
term.textContent += '\nCancel requested.\n';
term.scrollTop = term.scrollHeight;
}
});
}
function renderTaskLiveCharts(taskId, charts) {
const host = document.getElementById('task-live-charts');
if (!host) return;
if (!Array.isArray(charts) || charts.length === 0) {
host.innerHTML = 'Waiting for metric samples...';
return;
}
const seen = {};
charts.forEach(function(chart) {
seen[chart.file] = true;
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
if (img) {
const card = img.closest('.card');
if (card) {
const title = card.querySelector('.card-head');
if (title) title.textContent = chart.title;
}
return;
}
const card = document.createElement('div');
card.className = 'card';
card.style.margin = '0';
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
card.querySelector('.card-head').textContent = chart.title;
const body = card.querySelector('.card-body');
img = document.createElement('img');
img.setAttribute('data-task-chart', '1');
img.setAttribute('data-chart-file', chart.file);
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
img.style.width = '100%';
img.style.display = 'block';
img.style.borderRadius = '6px';
img.alt = chart.title;
body.appendChild(img);
host.appendChild(card);
});
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
const file = img.getAttribute('data-chart-file') || '';
if (seen[file]) return;
const card = img.closest('.card');
if (card) card.remove();
});
}
function loadTaskLiveCharts(taskId) {
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
renderTaskLiveCharts(taskId, charts);
}).catch(function(){
const host = document.getElementById('task-live-charts');
if (host) host.innerHTML = 'Task charts are unavailable.';
});
}
function refreshTaskLiveCharts() {
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
const base = img.dataset.baseSrc;
if (!base) return;
img.src = base + '?t=' + Date.now();
});
}
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
var _taskDetailTerm = document.getElementById('task-live-log');
var _taskChartTimer = null;
var _taskChartsFrozen = false;
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
_taskDetailES.addEventListener('done', function(e){
if (_taskChartTimer) clearInterval(_taskChartTimer);
_taskDetailES.close();
_taskDetailES = null;
_taskChartsFrozen = true;
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
refreshTaskLiveCharts();
});
_taskDetailES.onerror = function(){
if (_taskChartTimer) clearInterval(_taskChartTimer);
if (_taskDetailES) {
_taskDetailES.close();
_taskDetailES = null;
}
};
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
_taskChartTimer = setInterval(function(){
if (_taskChartsFrozen) return;
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
refreshTaskLiveCharts();
}, 2000);
</script>`)
	}

	// Assemble the final document around the shared layout chrome.
	return layoutHead(opts.Title+" — "+title) +
		layoutNav("tasks", opts.BuildLabel) +
		`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
		body.String() +
		`</div></div></body></html>`
}
|
||||||
|
|
||||||
|
func loadTaskReportFragment(task Task) string {
|
||||||
|
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||||
|
if strings.TrimSpace(absPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
taskPtr, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||||
|
}
|
||||||
|
task := *taskPtr
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
samples, err := loadTaskMetricSamples(start, end)
|
||||||
|
if err != nil {
|
||||||
|
return task, nil, start, end, true
|
||||||
|
}
|
||||||
|
return task, samples, start, end, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskChartPathFromFile(file string) (string, bool) {
|
||||||
|
file = strings.TrimSpace(file)
|
||||||
|
for _, spec := range taskDashboardChartSpecs {
|
||||||
|
if spec.File == file {
|
||||||
|
return spec.Path, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||||
|
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||||
|
return "gpu/" + id + "-overview", true
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
371
audit/internal/webui/task_report.go
Normal file
371
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskReportMetricsDBPath is the metrics database location used when
// building task reports; kept as a variable (not a const), presumably
// so tests can redirect it — confirm.
var taskReportMetricsDBPath = metricsDBPath
|
||||||
|
|
||||||
|
// taskReport is the JSON document persisted as a task's report.json
// artifact and rendered into the report HTML fragment.
type taskReport struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Target      string            `json:"target"`
	Status      string            `json:"status"`
	CreatedAt   time.Time         `json:"created_at"`
	StartedAt   *time.Time        `json:"started_at,omitempty"`
	DoneAt      *time.Time        `json:"done_at,omitempty"`
	DurationSec int               `json:"duration_sec,omitempty"`
	Error       string            `json:"error,omitempty"`
	LogFile     string            `json:"log_file,omitempty"`
	Charts      []taskReportChart `json:"charts,omitempty"`
	GeneratedAt time.Time         `json:"generated_at"`
}
|
||||||
|
|
||||||
|
// taskReportChart describes one chart saved with a task report: its
// display title and the SVG file name in the artifacts directory.
type taskReportChart struct {
	Title string `json:"title"`
	File  string `json:"file"`
}
|
||||||
|
|
||||||
|
// taskChartSpec pairs an internal chart path (as understood by the
// chart renderer) with the output SVG file name.
type taskChartSpec struct {
	Path string
	File string
}
|
||||||
|
|
||||||
|
// taskDashboardChartSpecs lists the fixed server- and aggregate-GPU
// charts rendered for every task; per-GPU overview charts are appended
// dynamically by taskChartSpecsForSamples.
var taskDashboardChartSpecs = []taskChartSpec{
	{Path: "server-load", File: "server-load.svg"},
	{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
	{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
	{Path: "server-power", File: "server-power.svg"},
	{Path: "server-fans", File: "server-fans.svg"},
	{Path: "gpu-all-load", File: "gpu-all-load.svg"},
	{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
	{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
	{Path: "gpu-all-power", File: "gpu-all-power.svg"},
	{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
}
|
||||||
|
|
||||||
|
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||||
|
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||||
|
specs = append(specs, taskDashboardChartSpecs...)
|
||||||
|
for _, idx := range taskGPUIndices(samples) {
|
||||||
|
specs = append(specs, taskChartSpec{
|
||||||
|
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||||
|
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return specs
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeTaskReportArtifacts renders and persists the task's report
// artifacts — chart SVGs, report JSON, and the report HTML fragment —
// into the task's artifacts directory. A nil task or one without an
// artifacts directory is a no-op.
func writeTaskReportArtifacts(t *Task) error {
	if t == nil {
		return nil
	}
	ensureTaskReportPaths(t)
	if strings.TrimSpace(t.ArtifactsDir) == "" {
		return nil
	}
	if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
		return err
	}

	// Chart rendering is best effort: a sample-load failure simply
	// produces a report without charts (error deliberately ignored).
	start, end := taskTimeWindow(t)
	samples, _ := loadTaskMetricSamples(start, end)
	charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)

	// The full task log is embedded in the HTML report; a missing log
	// file just yields an empty log section.
	logText := ""
	if data, err := os.ReadFile(t.LogPath); err == nil {
		logText = string(data)
	}

	report := taskReport{
		ID:          t.ID,
		Name:        t.Name,
		Target:      t.Target,
		Status:      t.Status,
		CreatedAt:   t.CreatedAt,
		StartedAt:   t.StartedAt,
		DoneAt:      t.DoneAt,
		DurationSec: taskElapsedSec(t, reportDoneTime(t)),
		Error:       t.ErrMsg,
		LogFile:     filepath.Base(t.LogPath),
		Charts:      charts,
		GeneratedAt: time.Now().UTC(),
	}
	// JSON first; the HTML fragment is derived from the same report.
	if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
		return err
	}
	return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
}
|
||||||
|
|
||||||
|
func reportDoneTime(t *Task) time.Time {
|
||||||
|
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
return *t.DoneAt
|
||||||
|
}
|
||||||
|
return time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||||
|
if t == nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
return now, now
|
||||||
|
}
|
||||||
|
start := t.CreatedAt.UTC()
|
||||||
|
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||||
|
start = t.StartedAt.UTC()
|
||||||
|
}
|
||||||
|
end := time.Now().UTC()
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = t.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
end = start
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadTaskMetricSamples loads the live metric samples recorded between
// start and end from the metrics database at taskReportMetricsDBPath.
// The database handle is opened per call and closed before returning.
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
	db, err := openMetricsDB(taskReportMetricsDBPath)
	if err != nil {
		return nil, err
	}
	defer db.Close()
	return db.LoadBetween(start, end)
}
|
||||||
|
|
||||||
|
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
var charts []taskReportChart
|
||||||
|
inline := make(map[string]string)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||||
|
if !ok || len(svg) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(dir, spec.File)
|
||||||
|
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||||
|
inline[spec.File] = string(svg)
|
||||||
|
}
|
||||||
|
return charts, inline
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskChartSVG renders the chart identified by path from samples,
// returning (title, svg, ok). ok is false when the chart has no data
// for these samples or rendering fails.
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
	// Per-GPU overview charts ("gpu/<idx>-overview") go through a
	// dedicated renderer rather than the generic metric-chart path.
	if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
		buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
		if err != nil || !hasData {
			return "", nil, false
		}
		return gpuDisplayLabel(idx) + " Overview", buf, true
	}
	// Generic path: extract the datasets for this chart and pick the
	// stacked or line renderer accordingly.
	datasets, names, labels, title, yMin, yMax, stacked, ok := chartDataFromSamples(path, samples)
	if !ok {
		return "", nil, false
	}
	var buf []byte
	var err error
	if stacked {
		buf, err = renderStackedMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
	} else {
		buf, err = renderMetricChartSVG(title, labels, sampleTimes(samples), datasets, names, yMin, yMax, chartCanvasHeightForPath(path, len(names)), timeline)
	}
	if err != nil {
		return "", nil, false
	}
	return title, buf, true
}
|
||||||
|
|
||||||
|
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var out []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if seen[g.GPUIndex] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
out = append(out, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(out)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSONFile(path string, v any) error {
|
||||||
|
data, err := json.MarshalIndent(v, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderTaskReportFragment builds the HTML fragment for a finished
// task: a summary card, optional benchmark/power result cards, one card
// per chart (using the inline SVG markup in charts, keyed by file
// name), and the full task log.
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
	var b strings.Builder
	// Summary card: name/target on the left, status (and error) on the right.
	b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
	b.WriteString(`<div class="grid2">`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
	b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
	b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
	if strings.TrimSpace(report.Error) != "" {
		b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
	}
	b.WriteString(`</div></div>`)
	// Timing footer of the summary card.
	b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
	b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
	b.WriteString(`</div></div></div>`)
	// Target-specific result cards (each renders "" when not applicable).
	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
		b.WriteString(benchmarkCard)
	}
	if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
		b.WriteString(powerCard)
	}

	// One card per chart, embedding the pre-rendered inline SVG.
	if len(report.Charts) > 0 {
		for _, chart := range report.Charts {
			b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
			b.WriteString(charts[chart.File])
			b.WriteString(`</div></div>`)
		}
	} else {
		b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
	}

	// Full task log, HTML-escaped, in a terminal-styled block.
	b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
	b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
	b.WriteString(`</div></div>`)
	return b.String()
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "nvidia-bench-perf":
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Perf Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskPowerResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(resultPath)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var result platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||||
|
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskArchivePathFromLog scans the task log bottom-up for the most
// recent "Archive: <path>.tar.gz" line and returns the path, or "".
func taskArchivePathFromLog(logText string) string {
	lines := strings.Split(logText, "\n")
	for i := len(lines); i > 0; i-- {
		rest, found := strings.CutPrefix(strings.TrimSpace(lines[i-1]), "Archive:")
		if !found {
			continue
		}
		path := strings.TrimSpace(rest)
		// Some log variants read "Archive: Archive written to <path>".
		if after, ok := strings.CutPrefix(path, "Archive written to "); ok {
			path = strings.TrimSpace(after)
		}
		if strings.HasSuffix(path, ".tar.gz") {
			return path
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func renderTaskStatusBadge(status string) string {
|
||||||
|
className := map[string]string{
|
||||||
|
TaskRunning: "badge-ok",
|
||||||
|
TaskPending: "badge-unknown",
|
||||||
|
TaskDone: "badge-ok",
|
||||||
|
TaskFailed: "badge-err",
|
||||||
|
TaskCancelled: "badge-unknown",
|
||||||
|
}[status]
|
||||||
|
if className == "" {
|
||||||
|
className = "badge-unknown"
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(status)
|
||||||
|
if label == "" {
|
||||||
|
label = "unknown"
|
||||||
|
}
|
||||||
|
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||||
|
if ts != nil && !ts.IsZero() {
|
||||||
|
return ts.Local().Format("2006-01-02 15:04:05")
|
||||||
|
}
|
||||||
|
if !fallback.IsZero() {
|
||||||
|
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||||
|
}
|
||||||
|
return "n/a"
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatTaskDuration renders sec as "Ns", "Nm SSs", or "Nh MMm SSs".
// Non-positive values yield "n/a".
func formatTaskDuration(sec int) string {
	switch {
	case sec <= 0:
		return "n/a"
	case sec < 60:
		return fmt.Sprintf("%ds", sec)
	case sec < 3600:
		return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
	default:
		return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
	}
}
|
||||||
505
audit/internal/webui/task_runner.go
Normal file
505
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/runtimeenv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskRunnerState is the small JSON document persisted alongside a
// task's artifacts so a restarted bee-web process can rediscover the
// runner process and its last known status.
type taskRunnerState struct {
	PID       int       `json:"pid"`
	Status    string    `json:"status"`
	Error     string    `json:"error,omitempty"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||||
|
|
||||||
|
func taskRunnerStatePath(t *Task) string {
|
||||||
|
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||||
|
path := taskRunnerStatePath(t)
|
||||||
|
if path == "" {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
var state taskRunnerState
|
||||||
|
if err := json.Unmarshal(data, &state); err != nil {
|
||||||
|
return taskRunnerState{}, false
|
||||||
|
}
|
||||||
|
return state, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// processAlive reports whether a process with the given PID currently
// exists. Sending signal 0 performs the existence check without delivering
// anything; EPERM means the process exists but is owned by another user,
// which still counts as alive.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	switch killErr := syscall.Kill(pid, 0); killErr {
	case nil, syscall.EPERM:
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
|
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
switch {
|
||||||
|
case cancelled:
|
||||||
|
t.Status = TaskCancelled
|
||||||
|
t.ErrMsg = "aborted"
|
||||||
|
case strings.TrimSpace(errMsg) != "":
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = errMsg
|
||||||
|
default:
|
||||||
|
t.Status = TaskDone
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// executeTaskWithOptions runs a single task to completion in-process.
// It dispatches on t.Target to the matching acceptance/benchmark/stress
// pack, streams human-readable progress lines into j, and terminates j via
// j.finish: with "" on success, the error message on failure, or "aborted"
// when ctx was cancelled. Targets that produce an archive get SAT status
// post-processing before finishing.
// NOTE(review): ctx is conventionally the first parameter in Go; kept last
// here to preserve the existing call sites.
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
	if opts == nil {
		j.append("ERROR: handler options not configured")
		j.finish("handler options not configured")
		return
	}
	a := opts.App

	// Pre-existing log lines mean this job was picked up again after a
	// bee-web restart; record that for operators reading the log.
	recovered := len(j.lines) > 0
	j.append(fmt.Sprintf("Starting %s...", t.Name))
	if recovered {
		j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
	}

	var (
		archive string
		err     error
	)

	// Dispatch on the task target. App-backed targets guard against a nil
	// *app.App first; "break" leaves the switch with err set.
	switch t.Target {
	case "nvidia":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Diagnostic depth: 2 normally, 3 when stress mode is requested.
		diagLevel := 2
		if t.params.StressMode {
			diagLevel = 3
		}
		// NOTE(review): diagLevel is always >= 2 at this point, so this
		// condition is always true and the RunNvidiaAcceptancePack fallback
		// below is unreachable — confirm whether it is still wanted.
		if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
			result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
			if e != nil {
				err = e
			} else {
				archive = result.Body
			}
		} else {
			archive, err = a.RunNvidiaAcceptancePack("", j.append)
		}
	case "nvidia-targeted-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Default the stress duration to 5 minutes when unset.
		dur := t.params.Duration
		if dur <= 0 {
			dur = 300
		}
		archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bench-perf":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			SizeMB:            t.params.SizeMB,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RunNCCL:           t.params.RunNCCL,
			ParallelGPUs:      t.params.ParallelGPUs,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
			Profile:           t.params.BenchmarkProfile,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			RampStep:          t.params.RampStep,
			RampTotal:         t.params.RampTotal,
			RampRunID:         t.params.RampRunID,
		}, j.append)
	case "nvidia-bench-autotune":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
			Profile: t.params.BenchmarkProfile,
			SizeMB:  t.params.SizeMB,
		}, t.params.BenchmarkKind, j.append)
	case "nvidia-compute":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Burn-profile presets supply a duration when none is given.
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		// An optional staggered ramp-up plan may adjust per-GPU start times
		// and the effective duration.
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
	case "nvidia-targeted-power":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-pulse":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
	case "nvidia-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-interconnect":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
	case "nvidia-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
		if planErr != nil {
			err = planErr
			break
		}
		if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
			dur = rampPlan.DurationSec
		}
		if rampPlan.StaggerSeconds > 0 {
			j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
		}
		archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
			DurationSec:       dur,
			Loader:            t.params.Loader,
			GPUIndices:        t.params.GPUIndices,
			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
			StaggerSeconds:    rampPlan.StaggerSeconds,
		}, j.append)
	case "memory":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
		j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
		archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
	case "storage":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
	case "cpu":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		// Fallback defaults: 30 minutes in stress mode, 1 minute otherwise.
		if dur <= 0 {
			if t.params.StressMode {
				dur = 1800
			} else {
				dur = 60
			}
		}
		j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
		archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
	case "amd":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
	case "amd-mem":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
	case "amd-bandwidth":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
	case "amd-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
	case "memory-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
	case "sat-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		dur := t.params.Duration
		if t.params.BurnProfile != "" && dur <= 0 {
			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
		}
		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
	case "platform-stress":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
		runOpts.Components = t.params.PlatformComponents
		archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
	case "audit":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		// Audit output is streamed line-by-line into the job log rather
		// than producing an archive.
		result, e := a.RunAuditNow(opts.RuntimeMode)
		if e != nil {
			err = e
		} else {
			for _, line := range splitLines(result.Body) {
				j.append(line)
			}
		}
	case "support-bundle":
		// Needs no *app.App; it only packages files from the export dir.
		j.append("Building support bundle...")
		archive, err = buildSupportBundle(opts.ExportDir)
	case "install":
		if strings.TrimSpace(t.params.Device) == "" {
			err = fmt.Errorf("device is required")
			break
		}
		installLogPath := platform.InstallLogPath(t.params.Device)
		j.append("Install log: " + installLogPath)
		err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
	case "install-to-ram":
		if a == nil {
			err = fmt.Errorf("app not configured")
			break
		}
		err = a.RunInstallToRAM(ctx, j.append)
	default:
		j.append("ERROR: unknown target: " + t.Target)
		j.finish("unknown target")
		return
	}

	// Post-process any produced archive: surface a SAT failure verdict as
	// a task error and fold the result into the status DB when available.
	if archive != "" {
		archivePath := app.ExtractArchivePath(archive)
		if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
			err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
		}
		if opts.App != nil && opts.App.StatusDB != nil {
			app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
		}
	}

	if err != nil {
		// Distinguish cancellation (ctx done) from real failures.
		if ctx.Err() != nil {
			j.append("Aborted.")
			j.finish("aborted")
		} else {
			j.append("ERROR: " + err.Error())
			j.finish(err.Error())
		}
		return
	}
	if archive != "" {
		j.append("Archive: " + archive)
	}
	j.finish("")
}
|
||||||
|
|
||||||
|
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||||
|
data, err := os.ReadFile(statePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var persisted []persistedTask
|
||||||
|
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, pt := range persisted {
|
||||||
|
if pt.ID != taskID {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := &Task{
|
||||||
|
ID: pt.ID,
|
||||||
|
Name: pt.Name,
|
||||||
|
Target: pt.Target,
|
||||||
|
Priority: pt.Priority,
|
||||||
|
Status: pt.Status,
|
||||||
|
CreatedAt: pt.CreatedAt,
|
||||||
|
StartedAt: pt.StartedAt,
|
||||||
|
DoneAt: pt.DoneAt,
|
||||||
|
ErrMsg: pt.ErrMsg,
|
||||||
|
LogPath: pt.LogPath,
|
||||||
|
ArtifactsDir: pt.ArtifactsDir,
|
||||||
|
ReportJSONPath: pt.ReportJSONPath,
|
||||||
|
ReportHTMLPath: pt.ReportHTMLPath,
|
||||||
|
params: pt.Params,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
return t, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("task %s not found", taskID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunPersistedTask executes a single task previously persisted by the web
// UI's queue (tasks-state.json under exportDir) inside this process, and
// maintains a runner-state.json heartbeat so the UI can track the detached
// runner. It returns a process exit code: 0 on success, 1 on failure, 2 on
// missing arguments.
// NOTE(review): stdout is currently unused; all diagnostics go to stderr —
// confirm whether that is intentional.
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
	if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
		fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
		return 2
	}

	// Runtime detection is best-effort: a failure is logged but does not
	// abort the run.
	runtimeInfo, err := runtimeenv.Detect("auto")
	if err != nil {
		slog.Warn("resolve runtime for task-run", "err", err)
	}
	opts := &HandlerOptions{
		ExportDir:   exportDir,
		App:         app.New(platform.New()),
		RuntimeMode: runtimeInfo.Mode,
	}
	statePath := filepath.Join(exportDir, "tasks-state.json")
	task, err := loadPersistedTask(statePath, taskID)
	if err != nil {
		fmt.Fprintln(stderr, err.Error())
		return 1
	}
	// Backfill start metadata the queue may not have set before handing off.
	if task.StartedAt == nil || task.StartedAt.IsZero() {
		now := time.Now()
		task.StartedAt = &now
	}
	if task.Status == "" {
		task.Status = TaskRunning
	}
	// Record this runner's PID up front so the UI can detect liveness or a
	// crashed runner. Failure to write the heartbeat is fatal.
	if err := writeTaskRunnerState(task, taskRunnerState{
		PID:       os.Getpid(),
		Status:    TaskRunning,
		UpdatedAt: time.Now().UTC(),
	}); err != nil {
		fmt.Fprintln(stderr, err.Error())
		return 1
	}

	// SIGINT/SIGTERM cancel ctx so the running pack can abort cleanly and
	// the task gets finalized as cancelled.
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
	executeTaskWithOptions(opts, task, j, ctx)
	finalizeTaskForResult(task, j.err, ctx.Err() != nil)
	// Report generation failure is non-fatal: warn in the log and continue.
	if err := writeTaskReportArtifacts(task); err != nil {
		appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
	}
	j.closeLog()
	// Final heartbeat carries the terminal status; a write failure here is
	// reported but does not change the exit code.
	if err := writeTaskRunnerState(task, taskRunnerState{
		PID:       os.Getpid(),
		Status:    task.Status,
		Error:     task.ErrMsg,
		UpdatedAt: time.Now().UTC(),
	}); err != nil {
		fmt.Fprintln(stderr, err.Error())
	}
	if task.ErrMsg != "" {
		return 1
	}
	return 0
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@ package webui
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -12,6 +13,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||||
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestNewJobIDUsesTASKPrefixAndZeroPadding verifies that newJobID produces
// sequential IDs of the form TASK-NNN, zero-padded and starting at 000.
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
	// Isolate the package-level queue and counter, restoring both afterwards
	// so other tests see their original state.
	globalQueue.mu.Lock()
	origTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	origCounter := jobCounter.Load()
	jobCounter.Store(0)
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = origTasks
		globalQueue.mu.Unlock()
		jobCounter.Store(origCounter)
	})

	// The argument passed here is "ignored" — presumably newJobID derives
	// IDs from the counter alone; confirm against its implementation.
	if got := newJobID("ignored"); got != "TASK-000" {
		t.Fatalf("id=%q want TASK-000", got)
	}
	if got := newJobID("ignored"); got != "TASK-001" {
		t.Fatalf("id=%q want TASK-001", got)
	}
}
|
||||||
|
|
||||||
|
// TestTaskArtifactsDirStartsWithTaskNumber verifies that the artifacts
// directory computed for a task starts with the numeric part of its ID
// (TASK-007 -> a directory whose base name begins with "007_").
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
	root := t.TempDir()
	task := &Task{
		ID:   "TASK-007",
		Name: "NVIDIA Benchmark",
	}
	got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
	if !strings.HasPrefix(got, "007_") {
		t.Fatalf("artifacts dir=%q want prefix 007_", got)
	}
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -248,15 +284,205 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
|||||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC().Add(-45 * time.Second)
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base,
|
||||||
|
CPULoadPct: 42,
|
||||||
|
MemLoadPct: 35,
|
||||||
|
PowerW: 510,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now().UTC().Add(-90 * time.Second)
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(task)
|
||||||
|
appendJobLog(task.LogPath, "line-1")
|
||||||
|
|
||||||
|
job := newTaskJobState(task.LogPath)
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
if task.Status != TaskDone {
|
||||||
|
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||||
|
}
|
||||||
|
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||||
|
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||||
|
t.Fatalf("report json: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||||
|
t.Fatalf("report html: %v", err)
|
||||||
|
}
|
||||||
|
var report taskReport
|
||||||
|
data, err := os.ReadFile(task.ReportJSONPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(data, &report); err != nil {
|
||||||
|
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if report.ID != task.ID || report.Status != TaskDone {
|
||||||
|
t.Fatalf("report=%+v", report)
|
||||||
|
}
|
||||||
|
if len(report.Charts) == 0 {
|
||||||
|
t.Fatalf("expected charts in report, got none")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Bee Bench Perf",
|
||||||
|
Target: "nvidia-bench-perf",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Perf Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`GPU 0`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
|
var lines []string
|
||||||
|
prev := taskSerialWriteLine
|
||||||
|
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||||
|
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-serial-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
q.enqueue(task)
|
||||||
|
started := time.Now().UTC()
|
||||||
|
task.Status = TaskRunning
|
||||||
|
task.StartedAt = &started
|
||||||
|
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
job.append("Starting CPU SAT...")
|
||||||
|
job.append("CPU stress duration: 60s")
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
joined := strings.Join(lines, "\n")
|
||||||
|
for _, needle := range []string{
|
||||||
|
"queued",
|
||||||
|
"Starting CPU SAT...",
|
||||||
|
"CPU stress duration: 60s",
|
||||||
|
"finished with status=done",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(joined, needle) {
|
||||||
|
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveBurnPreset(t *testing.T) {
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
profile string
|
profile string
|
||||||
want burnPreset
|
want burnPreset
|
||||||
}{
|
}{
|
||||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
@@ -265,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
profile string
|
||||||
|
enabled bool
|
||||||
|
selected []int
|
||||||
|
want nvidiaRampSpec
|
||||||
|
wantErr string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "disabled uses base preset",
|
||||||
|
profile: "acceptance",
|
||||||
|
selected: []int{0, 1},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "smoke ramp uses two minute steps",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "acceptance ramp uses ten minute steps",
|
||||||
|
profile: "acceptance",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight stays at eight hours when possible",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight extends to keep one hour after final gpu",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||||
|
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight rejects impossible gpu count",
|
||||||
|
profile: "overnight",
|
||||||
|
enabled: true,
|
||||||
|
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||||
|
wantErr: "at most 10 GPUs",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "enabled requires explicit selection",
|
||||||
|
profile: "smoke",
|
||||||
|
enabled: true,
|
||||||
|
wantErr: "requires explicit GPU selection",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||||
|
if tc.wantErr != "" {
|
||||||
|
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||||
|
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||||
|
}
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
loader string
|
loader string
|
||||||
@@ -369,6 +672,36 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskUsesQuickPresetForMemoryValidate(t *testing.T) {
|
||||||
|
var gotSizeMB, gotPasses int
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "mem-validate-1",
|
||||||
|
Name: "Memory SAT",
|
||||||
|
Target: "memory",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{StressMode: true},
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runMemoryAcceptancePackCtx
|
||||||
|
runMemoryAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, sizeMB, passes int, _ func(string)) (string, error) {
|
||||||
|
gotSizeMB = sizeMB
|
||||||
|
gotPasses = passes
|
||||||
|
return "/tmp/memory-validate.tar.gz", nil
|
||||||
|
}
|
||||||
|
defer func() { runMemoryAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.runTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if gotSizeMB != 512 || gotPasses != 1 {
|
||||||
|
t.Fatalf("memory validate preset=%dMB x%d want 512MB x1", gotSizeMB, gotPasses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
@@ -467,3 +800,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
|||||||
t.Fatalf("unexpected error: %q", j.err)
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-panic-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
panic("boom")
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.executeTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if tk.Status != TaskFailed {
|
||||||
|
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||||
|
}
|
||||||
|
if tk.DoneAt == nil {
|
||||||
|
t.Fatal("expected done_at to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||||
|
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(j.err, "task panic: boom") {
|
||||||
|
t.Fatalf("job error=%q", j.err)
|
||||||
|
}
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
277
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Benchmark methodology versioning
|
||||||
|
|
||||||
|
Every benchmark methodology change must bump the benchmark version constant in
|
||||||
|
source code by exactly `+1`.
|
||||||
|
|
||||||
|
Methodology change means any change that affects comparability of benchmark
|
||||||
|
results, including for example:
|
||||||
|
- phase durations or phase order
|
||||||
|
- enabled/disabled precisions
|
||||||
|
- fallback rules
|
||||||
|
- normalization rules
|
||||||
|
- score formulas or weights
|
||||||
|
- degradation thresholds
|
||||||
|
- power calibration logic
|
||||||
|
- thermal/power penalty logic
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- benchmark version must be stored in source code as an explicit version
|
||||||
|
constant, not inferred from git tag or build metadata
|
||||||
|
- benchmark report must always print the benchmark version
|
||||||
|
- `result.json` must always include the benchmark version
|
||||||
|
- results from different benchmark versions must be treated as non-comparable by
|
||||||
|
default
|
||||||
|
|
||||||
|
Purpose:
|
||||||
|
- prevent accidental comparison of runs produced by different methodologies
|
||||||
|
- make historical benchmark archives self-describing even when detached from git
|
||||||
|
- force deliberate version bumps whenever scoring or execution semantics change
|
||||||
|
|
||||||
|
## Status
|
||||||
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||||
|
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||||
|
`avg_steady_clock < locked_target * 0.90`.
|
||||||
|
|
||||||
|
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||||
|
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||||
|
The 90% threshold has no empirical basis.
|
||||||
|
|
||||||
|
## Key observations (2026-04-06)
|
||||||
|
|
||||||
|
### H100 PCIe — new card, server not designed for it
|
||||||
|
- avg clock 1384 MHz, P95 1560 MHz (unstable, proba boost 1755 MHz)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||||
|
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||||
|
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||||
|
|
||||||
|
### H200 NVL — new card, server not designed for it
|
||||||
|
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||||
|
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||||
|
- Degradation: power_capped, thermal_limited
|
||||||
|
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||||
|
|
||||||
|
### Key insight
|
||||||
|
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||||
|
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||||
|
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||||
|
instability may reflect a more severe thermal mismatch or a card issue.
|
||||||
|
|
||||||
|
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||||
|
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||||
|
|
||||||
|
## Hypothesis for baseline
|
||||||
|
|
||||||
|
After testing on servers designed for their GPUs (proper cooling):
|
||||||
|
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||||
|
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||||
|
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||||
|
a healthy card under real workload should comfortably exceed base clock
|
||||||
|
|
||||||
|
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||||
|
|
||||||
|
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||||
|
|
||||||
|
### Healthy fingerprint
|
||||||
|
|
||||||
|
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||||
|
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||||
|
- Avg steady (visual): **~1580–1620 MHz**
|
||||||
|
- vs boost 1755 MHz: **~91–92%**
|
||||||
|
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||||
|
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||||
|
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||||
|
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||||
|
|
||||||
|
### Characteristic patten
|
||||||
|
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||||
|
power cap algorithm. Do NOT flag this as instability.
|
||||||
|
|
||||||
|
### Clock CV implication
|
||||||
|
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||||
|
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||||
|
healthy HBM2e PCIe cards. Needs recalibration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||||
|
|
||||||
|
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||||
|
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||||
|
|
||||||
|
### GPU clock reference (from nvidia-smi, idle):
|
||||||
|
- base_clock_mhz: **1095**
|
||||||
|
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||||
|
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||||
|
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||||
|
|
||||||
|
### Observed under 700W sustained load (both samples nearly identical):
|
||||||
|
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||||
|
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||||
|
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||||
|
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||||
|
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||||
|
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||||
|
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||||
|
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||||
|
|
||||||
|
### Diagnosis
|
||||||
|
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||||
|
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||||
|
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||||
|
refurbishment process.
|
||||||
|
|
||||||
|
### Clock pattern note
|
||||||
|
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||||
|
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline matrix (filled where data available)
|
||||||
|
|
||||||
|
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||||
|
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||||
|
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||||
|
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## H100 official spec (from NVIDIA datasheet)
|
||||||
|
|
||||||
|
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||||
|
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||||
|
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||||
|
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||||
|
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||||
|
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||||
|
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||||
|
|
||||||
|
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||||
|
|
||||||
|
From the report in this session (power+thermal throttle throughout steady):
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||||
|
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||||
|
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||||
|
|
||||||
|
33–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||||
|
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||||
|
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||||
|
|
||||||
|
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||||
|
|
||||||
|
Format: without sparsity / with sparsity.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||||
|
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||||
|
|
||||||
|
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||||
|
|
||||||
|
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||||
|
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||||
|
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||||
|
|
||||||
|
Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
|
||||||
|
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||||
|
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||||
|
|
||||||
|
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||||
|
|
||||||
|
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||||
|
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||||
|
|
||||||
|
### What healthy systems actually achieve:
|
||||||
|
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||||
|
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||||
|
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||||
|
|
||||||
|
### Our results vs expectation:
|
||||||
|
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||||
|
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||||
|
|
||||||
|
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||||
|
This is NOT normal — 30-44% is not the industry baseline.
|
||||||
|
|
||||||
|
### Likely causes of the gap (in order of probability):
|
||||||
|
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||||
|
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||||
|
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||||
|
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||||
|
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||||
|
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||||
|
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||||
|
|
||||||
|
### Power limit gap analysis (H100 PCIe):
|
||||||
|
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||||
|
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||||
|
- Actually measured: 329 TOPS = 55% of that estimate
|
||||||
|
- Remaining gap after accounting for clock throttle: ~45%
|
||||||
|
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||||
|
sustainable clock beyond what sw_thermal alone would cause.
|
||||||
|
|
||||||
|
### Action item:
|
||||||
|
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||||
|
so result.json shows if the card was pre-configured with a non-default limit.
|
||||||
|
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||||
|
|
||||||
|
### CPU/RAM impact on GPU FLOPS:
|
||||||
|
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||||
|
CPU core count and host RAM are irrelevant.
|
||||||
|
|
||||||
|
## Compute efficiency metric (proposed, no hardcode)
|
||||||
|
|
||||||
|
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||||
|
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||||
|
|
||||||
|
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||||
|
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||||
|
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||||
|
normal clocks.
|
||||||
|
|
||||||
|
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||||
|
(needs to be added to queryBenchmarkGPUInfo).
|
||||||
|
|
||||||
|
Reference values to establish after baseline runs:
|
||||||
|
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||||
|
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||||
|
|
||||||
|
## Proposed threshold changes (pending more data)
|
||||||
|
|
||||||
|
1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed
|
||||||
|
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||||
|
capture the root cause.
|
||||||
|
|
||||||
|
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||||
|
under power cap. Consider suppressing this flag when power is flat and usage
|
||||||
|
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||||
|
|
||||||
|
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||||
|
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||||
|
would have been caught by this).
|
||||||
|
|
||||||
|
Decision deferred until baseline on SXM designed servers collected.
|
||||||
121
bible-local/docs/gpu-model-propagation.md
Normal file
121
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# GPU Model Name Propagation
|
||||||
|
|
||||||
|
How GPU model names are detected, stored, and displayed throughout the project.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Detection Sources
|
||||||
|
|
||||||
|
There are **two separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||||
|
|
||||||
|
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/sat.go`
|
||||||
|
|
||||||
|
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||||
|
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||||
|
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||||
|
|
||||||
|
### Pipeline B — Benchmark results
|
||||||
|
|
||||||
|
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||||
|
|
||||||
|
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||||
|
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||||
|
- Used by: benchmark history table, benchmark report
|
||||||
|
|
||||||
|
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||||
|
|
||||||
|
**File:** `audit/internal/schema/hardware.go`
|
||||||
|
|
||||||
|
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||||
|
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||||
|
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||||
|
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||||
|
|
||||||
|
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||||
|
|
||||||
|
This means:
|
||||||
|
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||||
|
- AMD GPUs do have their model populated
|
||||||
|
|
||||||
|
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benchmark History "Unknown GPU" Issue
|
||||||
|
|
||||||
|
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||||
|
|
||||||
|
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||||
|
|
||||||
|
This happens for:
|
||||||
|
- Older result files saved before the `Name` field was added
|
||||||
|
- Runs where nvidia-smi query failed before the benchmark started
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fallback Strings — Current State
|
||||||
|
|
||||||
|
| Location | File | Fallback string |
|
||||||
|
|---|---|---|
|
||||||
|
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||||
|
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||||
|
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||||
|
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||||
|
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||||
|
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||||
|
|
||||||
|
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## GPU Selection UI
|
||||||
|
|
||||||
|
**File:** `audit/internal/webui/pages.go`
|
||||||
|
|
||||||
|
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||||
|
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||||
|
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||||
|
|
||||||
|
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow Summary
|
||||||
|
|
||||||
|
```
|
||||||
|
nvidia-smi (live)
|
||||||
|
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||||
|
├─ GPU selection UI (always correct)
|
||||||
|
├─ Live metrics labels (charts_svg.go)
|
||||||
|
└─ SAT/burn status file (sat.go)
|
||||||
|
|
||||||
|
nvidia-smi (at benchmark start)
|
||||||
|
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||||
|
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||||
|
├─ Benchmark report
|
||||||
|
└─ Benchmark history table columns
|
||||||
|
|
||||||
|
nvidia-smi / lspci (audit collection)
|
||||||
|
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||||
|
└─ Hardware summary page hwDescribeGPU()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Fixed Issues
|
||||||
|
|
||||||
|
All previously open items are resolved:
|
||||||
|
|
||||||
|
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` sets `dev.Model = &v` (`nvidia.go:78`).
|
||||||
|
2. **Fallback consistency** — `sat.go` and `benchmark_report.go` both use `"Unknown GPU"`.
|
||||||
|
3. **`tops_per_sm_per_ghz`** — computed in `benchmark.go` and stored in `BenchmarkGPUScore.TOPSPerSMPerGHz`.
|
||||||
|
4. **`MultiprocessorCount`, `PowerLimitW`, `DefaultPowerLimitW`** — present in `benchmark_types.go`.
|
||||||
|
5. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue).
|
||||||
@@ -15,6 +15,41 @@ This applies to:
|
|||||||
- `iso/builder/config/package-lists/*.list.chroot`
|
- `iso/builder/config/package-lists/*.list.chroot`
|
||||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||||
|
|
||||||
|
## Bootloader sync rule
|
||||||
|
|
||||||
|
The ISO has two independent bootloader configs that must be kept in sync manually:
|
||||||
|
|
||||||
|
| File | Used by |
|
||||||
|
|------|---------|
|
||||||
|
| `config/bootloaders/grub-efi/grub.cfg` | UEFI (all modern servers) |
|
||||||
|
| `config/bootloaders/isolinux/live.cfg.in` | CSM / legacy BIOS (syslinux) |
|
||||||
|
|
||||||
|
live-build does NOT derive one from the other. Any new boot entry, kernel parameter
|
||||||
|
change, or new mode added to one file must be manually mirrored in the other.
|
||||||
|
|
||||||
|
**Canonical entry list** (both files must have all of these):
|
||||||
|
|
||||||
|
| Label | Key params |
|
||||||
|
|-------|-----------|
|
||||||
|
| normal (default) | `nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| load to RAM | `toram nomodeset bee.nvidia.mode=normal` + full param set |
|
||||||
|
| GSP=off | `nomodeset bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| KMS | no `nomodeset`, `bee.nvidia.mode=normal` + full param set |
|
||||||
|
| KMS + GSP=off | no `nomodeset`, `bee.nvidia.mode=gsp-off` + full param set |
|
||||||
|
| fail-safe | `nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp` |
|
||||||
|
|
||||||
|
**Full standard param set** (append after `@APPEND_LIVE@` / `nomodeset` flags):
|
||||||
|
```
|
||||||
|
net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always
|
||||||
|
numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||||
|
nowatchdog nosoftlockup
|
||||||
|
```
|
||||||
|
(fail-safe is the exception — it deliberately uses minimal params.)
|
||||||
|
|
||||||
|
**Historical note:** `grub-pc/` was mistakenly used instead of `grub-efi/` until v8.25.
|
||||||
|
live-build reads `config/bootloaders/grub-efi/` for UEFI because the build is
|
||||||
|
configured with `--bootloaders "grub-efi,syslinux"`. Directory `grub-pc` is ignored.
|
||||||
|
|
||||||
## Memtest rule
|
## Memtest rule
|
||||||
|
|
||||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||||
|
|||||||
@@ -1,12 +1,13 @@
|
|||||||
DEBIAN_VERSION=12
|
DEBIAN_VERSION=12
|
||||||
DEBIAN_KERNEL_ABI=auto
|
DEBIAN_KERNEL_ABI=auto
|
||||||
NVIDIA_DRIVER_VERSION=590.48.01
|
NVIDIA_DRIVER_VERSION=590.48.01
|
||||||
|
NVIDIA_FABRICMANAGER_VERSION=590.48.01-1
|
||||||
NCCL_VERSION=2.28.9-1
|
NCCL_VERSION=2.28.9-1
|
||||||
NCCL_CUDA_VERSION=13.0
|
NCCL_CUDA_VERSION=13.0
|
||||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||||
NCCL_TESTS_VERSION=2.13.10
|
NCCL_TESTS_VERSION=2.13.10
|
||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.1.1.3-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.3-1
|
DCGM_VERSION=4.5.3-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
@@ -21,3 +22,4 @@ HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
|||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
MEMTEST_VERSION=6.10-4
|
||||||
|
|||||||
@@ -23,16 +23,17 @@ lb config noauto \
|
|||||||
--bootloaders "grub-efi,syslinux" \
|
--bootloaders "grub-efi,syslinux" \
|
||||||
--debian-installer none \
|
--debian-installer none \
|
||||||
--archive-areas "main contrib non-free non-free-firmware" \
|
--archive-areas "main contrib non-free non-free-firmware" \
|
||||||
--mirror-bootstrap "https://deb.debian.org/debian" \
|
--mirror-bootstrap "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-chroot "https://deb.debian.org/debian" \
|
--mirror-chroot "http://mirror.mephi.ru/debian/" \
|
||||||
--mirror-binary "https://deb.debian.org/debian" \
|
--mirror-binary "http://mirror.mephi.ru/debian/" \
|
||||||
--security true \
|
--security true \
|
||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -33,10 +33,10 @@ typedef void *CUstream;
|
|||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||||
#define MAX_STRESS_STREAMS 16
|
#define MAX_STRESS_STREAMS 16
|
||||||
#define MAX_CUBLAS_PROFILES 5
|
|
||||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
#define STRESS_LAUNCH_DEPTH 8
|
#define MAX_SINGLE_PRECISION_STREAMS 4
|
||||||
|
#define MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES ((size_t)2u * 1024u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -298,6 +298,13 @@ static int choose_stream_count(int mp_count, int planned_profiles, size_t total_
|
|||||||
return stream_count;
|
return stream_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static size_t clamp_single_precision_profile_budget(size_t profile_budget_bytes) {
|
||||||
|
if (profile_budget_bytes > MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES) {
|
||||||
|
return MAX_SINGLE_PRECISION_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
return profile_budget_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
if (!api->cuStreamDestroy) {
|
if (!api->cuStreamDestroy) {
|
||||||
return;
|
return;
|
||||||
@@ -344,7 +351,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int launches_per_wave = 0;
|
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -419,44 +425,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
|
|
||||||
double start = now_seconds();
|
double deadline = now_seconds() + (double)seconds;
|
||||||
double deadline = start + (double)seconds;
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
launches_per_wave = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
int launched_this_batch = 0;
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
if (!check_rc(api,
|
||||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
"cuLaunchKernel",
|
||||||
if (!check_rc(api,
|
api->cuLaunchKernel(kernel,
|
||||||
"cuLaunchKernel",
|
blocks,
|
||||||
api->cuLaunchKernel(kernel,
|
1,
|
||||||
blocks,
|
1,
|
||||||
1,
|
threads,
|
||||||
1,
|
1,
|
||||||
threads,
|
1,
|
||||||
1,
|
0,
|
||||||
1,
|
streams[lane],
|
||||||
0,
|
params[lane],
|
||||||
streams[lane],
|
NULL))) {
|
||||||
params[lane],
|
goto fail;
|
||||||
NULL))) {
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
launches_per_wave++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
|
||||||
if (launched_this_batch <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
launched++;
|
||||||
|
iterations++;
|
||||||
}
|
}
|
||||||
if (launches_per_wave <= 0) {
|
if (launched <= 0) {
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
goto fail;
|
if (now >= next_sync || now >= deadline) {
|
||||||
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
next_sync = now + 1.0;
|
||||||
}
|
}
|
||||||
iterations += (unsigned long)launches_per_wave;
|
|
||||||
}
|
}
|
||||||
|
api->cuCtxSynchronize();
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
goto fail;
|
goto fail;
|
||||||
@@ -468,11 +472,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
bytes_per_stream[0] / (1024u * 1024u),
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
@@ -606,6 +609,20 @@ struct prepared_profile {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct profile_desc k_profiles[] = {
|
static const struct profile_desc k_profiles[] = {
|
||||||
|
{
|
||||||
|
"fp64",
|
||||||
|
"fp64",
|
||||||
|
80,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
8,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUBLAS_COMPUTE_64F,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp32_tf32",
|
"fp32_tf32",
|
||||||
"fp32",
|
"fp32",
|
||||||
@@ -634,6 +651,20 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
CUDA_R_16F,
|
CUDA_R_16F,
|
||||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"int8_tensor",
|
||||||
|
"int8",
|
||||||
|
75,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
128,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUBLAS_COMPUTE_32I,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp8_e4m3",
|
"fp8_e4m3",
|
||||||
"fp8",
|
"fp8",
|
||||||
@@ -680,6 +711,21 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||||
|
|
||||||
|
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
|
||||||
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
return strcmp(desc->block_label, precision_filter) == 0;
|
||||||
|
}
|
||||||
|
/* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
|
||||||
|
* unstable on the current benchmark fleet and can abort the whole mixed
|
||||||
|
* pass after earlier phases already collected useful telemetry. */
|
||||||
|
return strcmp(desc->block_label, "fp64") != 0 && strcmp(desc->block_label, "fp4") != 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int load_cublaslt(struct cublaslt_api *api) {
|
static int load_cublaslt(struct cublaslt_api *api) {
|
||||||
memset(api, 0, sizeof(*api));
|
memset(api, 0, sizeof(*api));
|
||||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||||
@@ -750,10 +796,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
|||||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case CUDA_R_32F:
|
case CUDA_R_32F:
|
||||||
|
case CUDA_R_32I:
|
||||||
return (size_t)(elements * 4u);
|
return (size_t)(elements * 4u);
|
||||||
case CUDA_R_16F:
|
case CUDA_R_16F:
|
||||||
case CUDA_R_16BF:
|
case CUDA_R_16BF:
|
||||||
return (size_t)(elements * 2u);
|
return (size_t)(elements * 2u);
|
||||||
|
case CUDA_R_8I:
|
||||||
case CUDA_R_8F_E4M3:
|
case CUDA_R_8F_E4M3:
|
||||||
case CUDA_R_8F_E5M2:
|
case CUDA_R_8F_E5M2:
|
||||||
return (size_t)(elements);
|
return (size_t)(elements);
|
||||||
@@ -766,6 +814,16 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
return CUDA_R_32I;
|
||||||
|
}
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
return CUDA_R_64F;
|
||||||
|
}
|
||||||
|
return CUDA_R_32F;
|
||||||
|
}
|
||||||
|
|
||||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||||
@@ -872,11 +930,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
CUstream stream,
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
|
||||||
out->desc = *desc;
|
|
||||||
out->stream = stream;
|
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
|
size_t attempt_budget = profile_budget_bytes;
|
||||||
|
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->b_type, 1);
|
||||||
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->c_type, 1);
|
||||||
@@ -885,105 +941,115 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t dim = choose_square_dim(profile_budget_bytes, bytes_per_cell, desc->min_multiple);
|
while (attempt_budget >= MIN_PROFILE_BUDGET_BYTES) {
|
||||||
out->m = dim;
|
memset(out, 0, sizeof(*out));
|
||||||
out->n = dim;
|
out->desc = *desc;
|
||||||
out->k = dim;
|
out->stream = stream;
|
||||||
|
|
||||||
size_t desired_workspace = profile_budget_bytes / 8u;
|
uint64_t dim = choose_square_dim(attempt_budget, bytes_per_cell, desc->min_multiple);
|
||||||
if (desired_workspace > 32u * 1024u * 1024u) {
|
out->m = dim;
|
||||||
desired_workspace = 32u * 1024u * 1024u;
|
out->n = dim;
|
||||||
}
|
out->k = dim;
|
||||||
desired_workspace = round_down_size(desired_workspace, 256u);
|
|
||||||
|
|
||||||
size_t a_bytes = 0;
|
size_t desired_workspace = attempt_budget / 8u;
|
||||||
size_t b_bytes = 0;
|
if (desired_workspace > 32u * 1024u * 1024u) {
|
||||||
size_t c_bytes = 0;
|
desired_workspace = 32u * 1024u * 1024u;
|
||||||
size_t d_bytes = 0;
|
}
|
||||||
size_t scale_bytes = 0;
|
desired_workspace = round_down_size(desired_workspace, 256u);
|
||||||
while (1) {
|
|
||||||
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
|
||||||
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
|
||||||
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
|
||||||
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
|
||||||
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
|
||||||
|
|
||||||
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
size_t a_bytes = 0;
|
||||||
if (matrix_bytes <= profile_budget_bytes) {
|
size_t b_bytes = 0;
|
||||||
size_t remaining = profile_budget_bytes - matrix_bytes;
|
size_t c_bytes = 0;
|
||||||
out->workspace_size = desired_workspace;
|
size_t d_bytes = 0;
|
||||||
if (out->workspace_size > remaining) {
|
size_t scale_bytes = 0;
|
||||||
out->workspace_size = round_down_size(remaining, 256u);
|
while (1) {
|
||||||
|
a_bytes = bytes_for_elements(desc->a_type, out->k * out->m);
|
||||||
|
b_bytes = bytes_for_elements(desc->b_type, out->k * out->n);
|
||||||
|
c_bytes = bytes_for_elements(desc->c_type, out->m * out->n);
|
||||||
|
d_bytes = bytes_for_elements(desc->d_type, out->m * out->n);
|
||||||
|
scale_bytes = profile_scale_bytes(desc, out->m, out->n, out->k);
|
||||||
|
|
||||||
|
size_t matrix_bytes = a_bytes + b_bytes + c_bytes + d_bytes + scale_bytes;
|
||||||
|
if (matrix_bytes <= attempt_budget) {
|
||||||
|
size_t remaining = attempt_budget - matrix_bytes;
|
||||||
|
out->workspace_size = desired_workspace;
|
||||||
|
if (out->workspace_size > remaining) {
|
||||||
|
out->workspace_size = round_down_size(remaining, 256u);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
if (out->m <= (uint64_t)desc->min_multiple) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out->m -= (uint64_t)desc->min_multiple;
|
||||||
|
out->n = out->m;
|
||||||
|
out->k = out->m;
|
||||||
|
}
|
||||||
|
if (out->m < (uint64_t)desc->min_multiple) {
|
||||||
|
attempt_budget /= 2u;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (out->m <= (uint64_t)desc->min_multiple) {
|
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
||||||
return 0;
|
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
||||||
}
|
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
||||||
out->m -= (uint64_t)desc->min_multiple;
|
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
||||||
out->n = out->m;
|
|
||||||
out->k = out->m;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!alloc_filled(cuda, &out->a_dev, a_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->b_dev, b_bytes, 0x11) ||
|
|
||||||
!alloc_filled(cuda, &out->c_dev, c_bytes, 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->d_dev, d_bytes, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
cublasOperation_t transa = CUBLAS_OP_T;
|
|
||||||
cublasOperation_t transb = CUBLAS_OP_N;
|
|
||||||
if (!check_cublas("set TRANSA",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
||||||
&transa,
|
|
||||||
sizeof(transa))) ||
|
|
||||||
!check_cublas("set TRANSB",
|
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
|
||||||
CUBLASLT_MATMUL_DESC_TRANSB,
|
|
||||||
&transb,
|
|
||||||
sizeof(transb)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (desc->needs_scalar_scale) {
|
|
||||||
float one = 1.0f;
|
|
||||||
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
|
||||||
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
|
||||||
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
|
||||||
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
cublasOperation_t transa = CUBLAS_OP_T;
|
||||||
if (!check_cublas("set A scale ptr",
|
cublasOperation_t transb = CUBLAS_OP_N;
|
||||||
|
if (!check_cublas("set TRANSA",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
||||||
&a_scale_ptr,
|
&transa,
|
||||||
sizeof(a_scale_ptr))) ||
|
sizeof(transa))) ||
|
||||||
!check_cublas("set B scale ptr",
|
!check_cublas("set TRANSB",
|
||||||
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
CUBLASLT_MATMUL_DESC_TRANSB,
|
||||||
&b_scale_ptr,
|
&transb,
|
||||||
sizeof(b_scale_ptr)))) {
|
sizeof(transb)))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
if (desc->needs_scalar_scale) {
|
||||||
|
float one = 1.0f;
|
||||||
|
if (!alloc_filled(cuda, &out->a_scale_dev, sizeof(one), 0x00) ||
|
||||||
|
!alloc_filled(cuda, &out->b_scale_dev, sizeof(one), 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (!device_upload(cuda, out->a_scale_dev, &one, sizeof(one)) ||
|
||||||
|
!device_upload(cuda, out->b_scale_dev, &one, sizeof(one))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
void *a_scale_ptr = (void *)(uintptr_t)out->a_scale_dev;
|
||||||
|
void *b_scale_ptr = (void *)(uintptr_t)out->b_scale_dev;
|
||||||
|
if (!check_cublas("set A scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
|
||||||
|
&a_scale_ptr,
|
||||||
|
sizeof(a_scale_ptr))) ||
|
||||||
|
!check_cublas("set B scale ptr",
|
||||||
|
cublas->cublasLtMatmulDescSetAttribute(out->op_desc,
|
||||||
|
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
|
||||||
|
&b_scale_ptr,
|
||||||
|
sizeof(b_scale_ptr)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
#if defined(CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3)
|
||||||
if (desc->needs_block_scale) {
|
if (desc->needs_block_scale) {
|
||||||
@@ -1023,78 +1089,94 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!check_cublas("create A layout",
|
if (!check_cublas("create A layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->a_layout, desc->a_type, out->k, out->m, out->k)) ||
|
||||||
!check_cublas("create B layout",
|
!check_cublas("create B layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->b_layout, desc->b_type, out->k, out->n, out->k)) ||
|
||||||
!check_cublas("create C layout",
|
!check_cublas("create C layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
cublas->cublasLtMatrixLayoutCreate(&out->c_layout, desc->c_type, out->m, out->n, out->m)) ||
|
||||||
!check_cublas("create D layout",
|
!check_cublas("create D layout",
|
||||||
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
cublas->cublasLtMatrixLayoutCreate(&out->d_layout, desc->d_type, out->m, out->n, out->m))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (out->workspace_size > 0) {
|
|
||||||
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("create preference", cublas->cublasLtMatmulPreferenceCreate(&out->preference))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (out->workspace_size > 0) {
|
||||||
|
if (!alloc_filled(cuda, &out->workspace_dev, out->workspace_size, 0x00)) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!check_cublas("set workspace",
|
||||||
|
cublas->cublasLtMatmulPreferenceSetAttribute(
|
||||||
|
out->preference,
|
||||||
|
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&out->workspace_size,
|
||||||
|
sizeof(out->workspace_size)))) {
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int found = 0;
|
||||||
|
if (check_cublas("heuristic",
|
||||||
|
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
||||||
|
out->op_desc,
|
||||||
|
out->a_layout,
|
||||||
|
out->b_layout,
|
||||||
|
out->c_layout,
|
||||||
|
out->d_layout,
|
||||||
|
out->preference,
|
||||||
|
1,
|
||||||
|
&out->heuristic,
|
||||||
|
&found)) &&
|
||||||
|
found > 0) {
|
||||||
|
out->ready = 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
destroy_profile(cublas, cuda, out);
|
||||||
|
attempt_budget = round_down_size(attempt_budget * 3u / 4u, 256u);
|
||||||
|
if (attempt_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_cublas("set workspace",
|
return 0;
|
||||||
cublas->cublasLtMatmulPreferenceSetAttribute(
|
|
||||||
out->preference,
|
|
||||||
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
|
||||||
&out->workspace_size,
|
|
||||||
sizeof(out->workspace_size)))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int found = 0;
|
|
||||||
if (!check_cublas("heuristic",
|
|
||||||
cublas->cublasLtMatmulAlgoGetHeuristic(handle,
|
|
||||||
out->op_desc,
|
|
||||||
out->a_layout,
|
|
||||||
out->b_layout,
|
|
||||||
out->c_layout,
|
|
||||||
out->d_layout,
|
|
||||||
out->preference,
|
|
||||||
1,
|
|
||||||
&out->heuristic,
|
|
||||||
&found))) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (found <= 0) {
|
|
||||||
destroy_profile(cublas, cuda, out);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
out->ready = 1;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
struct cublaslt_api *cublas,
|
struct cublaslt_api *cublas,
|
||||||
struct prepared_profile *profile) {
|
struct prepared_profile *profile) {
|
||||||
|
int32_t alpha_i32 = 1;
|
||||||
|
int32_t beta_i32 = 0;
|
||||||
|
double alpha_f64 = 1.0;
|
||||||
|
double beta_f64 = 0.0;
|
||||||
float alpha = 1.0f;
|
float alpha = 1.0f;
|
||||||
float beta = 0.0f;
|
float beta = 0.0f;
|
||||||
|
const void *alpha_ptr = α
|
||||||
|
const void *beta_ptr = β
|
||||||
|
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
alpha_ptr = &alpha_i32;
|
||||||
|
beta_ptr = &beta_i32;
|
||||||
|
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
|
||||||
|
alpha_ptr = &alpha_f64;
|
||||||
|
beta_ptr = &beta_f64;
|
||||||
|
}
|
||||||
return check_cublas(profile->desc.name,
|
return check_cublas(profile->desc.name,
|
||||||
cublas->cublasLtMatmul(handle,
|
cublas->cublasLtMatmul(handle,
|
||||||
profile->op_desc,
|
profile->op_desc,
|
||||||
&alpha,
|
alpha_ptr,
|
||||||
(const void *)(uintptr_t)profile->a_dev,
|
(const void *)(uintptr_t)profile->a_dev,
|
||||||
profile->a_layout,
|
profile->a_layout,
|
||||||
(const void *)(uintptr_t)profile->b_dev,
|
(const void *)(uintptr_t)profile->b_dev,
|
||||||
profile->b_layout,
|
profile->b_layout,
|
||||||
&beta,
|
beta_ptr,
|
||||||
(const void *)(uintptr_t)profile->c_dev,
|
(const void *)(uintptr_t)profile->c_dev,
|
||||||
profile->c_layout,
|
profile->c_layout,
|
||||||
(void *)(uintptr_t)profile->d_dev,
|
(void *)(uintptr_t)profile->d_dev,
|
||||||
@@ -1112,9 +1194,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int cc_minor,
|
int cc_minor,
|
||||||
int seconds,
|
int seconds,
|
||||||
int size_mb,
|
int size_mb,
|
||||||
|
const char *precision_filter,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
struct cublaslt_api cublas;
|
struct cublaslt_api cublas;
|
||||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||||
cublasLtHandle_t handle = NULL;
|
cublasLtHandle_t handle = NULL;
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
@@ -1124,12 +1207,12 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int active = 0;
|
int active = 0;
|
||||||
int mp_count = 0;
|
int mp_count = 0;
|
||||||
int stream_count = 1;
|
int stream_count = 1;
|
||||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
int profile_count = PROFILE_COUNT;
|
||||||
int prepared_count = 0;
|
int prepared_count = 0;
|
||||||
int wave_launches = 0;
|
|
||||||
size_t requested_budget = 0;
|
size_t requested_budget = 0;
|
||||||
size_t total_budget = 0;
|
size_t total_budget = 0;
|
||||||
size_t per_profile_budget = 0;
|
size_t per_profile_budget = 0;
|
||||||
|
int budget_profiles = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -1150,8 +1233,9 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count profiles matching the filter (for deciding what to run). */
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
planned++;
|
planned++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1162,18 +1246,42 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Count all profiles active on this GPU regardless of filter.
|
||||||
|
* Mixed phases still divide budget across the full precision set, while
|
||||||
|
* single-precision benchmark phases dedicate budget only to active
|
||||||
|
* profiles matching precision_filter. */
|
||||||
|
int planned_total = 0;
|
||||||
|
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||||
|
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
|
||||||
|
planned_total++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (planned_total < planned) {
|
||||||
|
planned_total = planned;
|
||||||
|
}
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
budget_profiles = planned;
|
||||||
|
}
|
||||||
|
if (budget_profiles <= 0) {
|
||||||
|
budget_profiles = planned_total;
|
||||||
|
}
|
||||||
|
|
||||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (requested_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
requested_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
if (total_budget < (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
total_budget = (size_t)budget_profiles * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
cuda->cuStreamCreate &&
|
cuda->cuStreamCreate &&
|
||||||
cuda->cuStreamDestroy) {
|
cuda->cuStreamDestroy) {
|
||||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
stream_count = choose_stream_count(mp_count, budget_profiles, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (precision_filter != NULL && stream_count > MAX_SINGLE_PRECISION_STREAMS) {
|
||||||
|
stream_count = MAX_SINGLE_PRECISION_STREAMS;
|
||||||
}
|
}
|
||||||
if (stream_count > 1) {
|
if (stream_count > 1) {
|
||||||
int created = 0;
|
int created = 0;
|
||||||
@@ -1186,19 +1294,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
report->stream_count = stream_count;
|
report->stream_count = stream_count;
|
||||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
per_profile_budget = total_budget / ((size_t)budget_profiles * (size_t)stream_count);
|
||||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (precision_filter != NULL) {
|
||||||
|
per_profile_budget = clamp_single_precision_profile_budget(per_profile_budget);
|
||||||
|
}
|
||||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d budget_profiles=%d per_worker_mb=%zu\n",
|
||||||
size_mb,
|
size_mb,
|
||||||
report->buffer_mb,
|
report->buffer_mb,
|
||||||
report->stream_count,
|
report->stream_count,
|
||||||
STRESS_LAUNCH_DEPTH,
|
|
||||||
mp_count,
|
mp_count,
|
||||||
|
budget_profiles,
|
||||||
per_profile_budget / (1024u * 1024u));
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (int i = 0; i < profile_count; i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
@@ -1211,6 +1322,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=SKIPPED benchmark_disabled\n",
|
||||||
|
desc->name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int lane = 0; lane < stream_count; lane++) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
CUstream stream = streams[lane];
|
CUstream stream = streams[lane];
|
||||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||||
@@ -1246,50 +1364,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Keep the GPU queue continuously full by submitting kernels without
|
||||||
|
* synchronizing after every wave. A sync barrier after each small batch
|
||||||
|
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||||
|
* especially when individual kernels are short. Instead we sync at most
|
||||||
|
* once per second (for error detection) and once at the very end. */
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
|
double next_sync = now_seconds() + 1.0;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
wave_launches = 0;
|
int launched = 0;
|
||||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
int launched_this_batch = 0;
|
if (!prepared[i].ready) {
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
continue;
|
||||||
if (!prepared[i].ready) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
|
||||||
append_detail(report->details,
|
|
||||||
sizeof(report->details),
|
|
||||||
"%s=FAILED runtime\n",
|
|
||||||
prepared[i].desc.name);
|
|
||||||
for (int j = 0; j < prepared_count; j++) {
|
|
||||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
|
||||||
}
|
|
||||||
cublas.cublasLtDestroy(handle);
|
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
prepared[i].iterations++;
|
|
||||||
report->iterations++;
|
|
||||||
wave_launches++;
|
|
||||||
launched_this_batch++;
|
|
||||||
}
|
}
|
||||||
if (launched_this_batch <= 0) {
|
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||||
break;
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s=FAILED runtime\n",
|
||||||
|
prepared[i].desc.name);
|
||||||
|
for (int j = 0; j < prepared_count; j++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
prepared[i].iterations++;
|
||||||
|
report->iterations++;
|
||||||
|
launched++;
|
||||||
}
|
}
|
||||||
if (wave_launches <= 0) {
|
if (launched <= 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
double now = now_seconds();
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
if (now >= next_sync || now >= deadline) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
next_sync = now + 1.0;
|
||||||
destroy_streams(cuda, streams, stream_count);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* Final drain — ensure all queued work finishes before we read results. */
|
||||||
|
cuda->cuCtxSynchronize();
|
||||||
|
|
||||||
for (int i = 0; i < prepared_count; i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
@@ -1323,10 +1446,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||||
|
printf("device=%s\n", report->device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
|
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||||
|
printf("backend=%s\n", report->backend);
|
||||||
|
printf("duration_s=%d\n", seconds);
|
||||||
|
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||||
|
printf("streams=%d\n", report->stream_count);
|
||||||
|
printf("iterations=%lu\n", report->iterations);
|
||||||
|
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||||
|
if (report->details[0] != '\0') {
|
||||||
|
printf("%s", report->details);
|
||||||
|
}
|
||||||
|
printf("status=OK\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
int device_index = 0;
|
int device_index = 0;
|
||||||
|
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||||
|
const char *precision_plan = NULL;
|
||||||
|
const char *precision_plan_seconds = NULL;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
@@ -1334,8 +1476,16 @@ int main(int argc, char **argv) {
|
|||||||
size_mb = atoi(argv[++i]);
|
size_mb = atoi(argv[++i]);
|
||||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||||
device_index = atoi(argv[++i]);
|
device_index = atoi(argv[++i]);
|
||||||
|
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||||
|
precision_filter = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan_seconds = argv[++i];
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
fprintf(stderr,
|
||||||
|
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||||
|
argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1395,26 +1545,94 @@ int main(int argc, char **argv) {
|
|||||||
int ok = 0;
|
int ok = 0;
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||||
|
char *plan_copy = strdup(precision_plan);
|
||||||
|
char *plan_seconds_copy = NULL;
|
||||||
|
int phase_seconds[32] = {0};
|
||||||
|
int phase_seconds_count = 0;
|
||||||
|
int phase_ok = 0;
|
||||||
|
if (plan_copy == NULL) {
|
||||||
|
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||||
|
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||||
|
if (plan_seconds_copy == NULL) {
|
||||||
|
free(plan_copy);
|
||||||
|
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||||
|
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||||
|
sec_token = strtok(NULL, ",")) {
|
||||||
|
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||||
|
sec_token++;
|
||||||
|
}
|
||||||
|
if (*sec_token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int phase_idx = 0;
|
||||||
|
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||||
|
while (*token == ' ' || *token == '\t') {
|
||||||
|
token++;
|
||||||
|
}
|
||||||
|
if (*token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char *phase_name = token;
|
||||||
|
const char *phase_filter = token;
|
||||||
|
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||||
|
phase_filter = NULL;
|
||||||
|
}
|
||||||
|
int phase_duration = seconds;
|
||||||
|
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||||
|
phase_duration = phase_seconds[phase_idx];
|
||||||
|
}
|
||||||
|
printf("phase_begin=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
memset(&report, 0, sizeof(report));
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||||
|
if (ok) {
|
||||||
|
print_stress_report(&report, device_index, phase_duration);
|
||||||
|
phase_ok = 1;
|
||||||
|
} else {
|
||||||
|
printf("phase_error=%s\n", phase_name);
|
||||||
|
if (report.details[0] != '\0') {
|
||||||
|
printf("%s", report.details);
|
||||||
|
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("status=FAILED\n");
|
||||||
|
}
|
||||||
|
printf("phase_end=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
free(plan_seconds_copy);
|
||||||
|
free(plan_copy);
|
||||||
|
return phase_ok ? 0 : 1;
|
||||||
|
}
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||||
#endif
|
#endif
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
if (precision_filter != NULL) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||||
|
precision_filter,
|
||||||
|
name,
|
||||||
|
cc_major,
|
||||||
|
cc_minor);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
int ptx_mb = size_mb;
|
||||||
|
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
print_stress_report(&report, device_index, seconds);
|
||||||
printf("device_index=%d\n", device_index);
|
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
|
||||||
printf("backend=%s\n", report.backend);
|
|
||||||
printf("duration_s=%d\n", seconds);
|
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
|
||||||
printf("streams=%d\n", report.stream_count);
|
|
||||||
printf("iterations=%lu\n", report.iterations);
|
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
|
||||||
if (report.details[0] != '\0') {
|
|
||||||
printf("%s", report.details);
|
|
||||||
}
|
|
||||||
printf("status=OK\n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$VARIANT" in
|
case "$VARIANT" in
|
||||||
nvidia|amd|nogpu|all) ;;
|
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -156,6 +161,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||||
@@ -170,6 +176,7 @@ run_variant() {
|
|||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
|
-e BEE_REQUIRE_MEMTEST=1 \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
"${IMAGE_REF}" \
|
||||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||||
@@ -180,6 +187,9 @@ case "$VARIANT" in
|
|||||||
nvidia)
|
nvidia)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
;;
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
run_variant nvidia-legacy
|
||||||
|
;;
|
||||||
amd)
|
amd)
|
||||||
run_variant amd
|
run_variant amd
|
||||||
;;
|
;;
|
||||||
@@ -188,6 +198,7 @@ case "$VARIANT" in
|
|||||||
;;
|
;;
|
||||||
all)
|
all)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
|
run_variant nvidia-legacy
|
||||||
run_variant amd
|
run_variant amd
|
||||||
run_variant nogpu
|
run_variant nogpu
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||||
|
# - open -> kernel-open/ sources from the .run installer
|
||||||
|
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||||
#
|
#
|
||||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||||
@@ -17,10 +19,19 @@ set -e
|
|||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
DEBIAN_KERNEL_ABI="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
NVIDIA_FLAVOR="${4:-open}"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
|
|
||||||
|
case "$NVIDIA_FLAVOR" in
|
||||||
|
open|proprietary) ;;
|
||||||
|
*)
|
||||||
|
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# On Debian, kernel headers are split into two packages:
|
# On Debian, kernel headers are split into two packages:
|
||||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
|||||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||||
|
|
||||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||||
echo "=== installing linux-headers-${KVER} ==="
|
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
|
||||||
"linux-headers-${KVER}" \
|
|
||||||
gcc make perl
|
|
||||||
fi
|
|
||||||
echo "kernel headers (arch): $KDIR_ARCH"
|
|
||||||
echo "kernel headers (common): $KDIR_COMMON"
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
CACHE_LAYOUT_VERSION="2"
|
CACHE_LAYOUT_VERSION="3"
|
||||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
# Download official NVIDIA .run installer with sha256 verification
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
|||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
# Find kernel source directory for the selected flavor.
|
||||||
KERNEL_SRC=""
|
KERNEL_SRC=""
|
||||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||||
done
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
done
|
||||||
|
else
|
||||||
|
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||||
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules
|
# Build kernel modules
|
||||||
|
|||||||
@@ -15,28 +15,49 @@ DIST_DIR="${REPO_ROOT}/dist"
|
|||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BUILD_VARIANT="nvidia"
|
||||||
BEE_GPU_VENDOR="nvidia"
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$BEE_GPU_VENDOR" in
|
case "$BUILD_VARIANT" in
|
||||||
nvidia|amd|nogpu) ;;
|
nvidia)
|
||||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
BEE_GPU_VENDOR="amd"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
BEE_GPU_VENDOR="nogpu"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
export MEMTEST_VERSION
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
@@ -105,6 +126,37 @@ resolve_iso_version() {
|
|||||||
resolve_audit_version
|
resolve_audit_version
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sync_builder_workdir() {
|
||||||
|
src_dir="$1"
|
||||||
|
dst_dir="$2"
|
||||||
|
|
||||||
|
mkdir -p "$dst_dir"
|
||||||
|
|
||||||
|
# Historical bug: old workdirs could keep config/bootloaders/grub-pc even
|
||||||
|
# after the source tree moved to grub-efi only. Remove bootloaders eagerly
|
||||||
|
# so reused workdirs cannot leak stale templates into a new ISO build.
|
||||||
|
rm -rf "$dst_dir/config/bootloaders"
|
||||||
|
|
||||||
|
rsync -a --delete \
|
||||||
|
--exclude='cache/' \
|
||||||
|
--exclude='chroot/' \
|
||||||
|
--exclude='.build/' \
|
||||||
|
--exclude='*.iso' \
|
||||||
|
--exclude='*.packages' \
|
||||||
|
--exclude='*.contents' \
|
||||||
|
--exclude='*.files' \
|
||||||
|
"$src_dir/" "$dst_dir/"
|
||||||
|
|
||||||
|
if [ ! -f "$dst_dir/config/bootloaders/grub-efi/grub.cfg" ]; then
|
||||||
|
echo "ERROR: staged workdir is missing config/bootloaders/grub-efi/grub.cfg" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ -e "$dst_dir/config/bootloaders/grub-pc" ]; then
|
||||||
|
echo "ERROR: stale config/bootloaders/grub-pc remained in staged workdir" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
iso_list_files() {
|
iso_list_files() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
|
|
||||||
@@ -182,7 +234,7 @@ dump_memtest_debug() {
|
|||||||
|
|
||||||
echo "-- source bootloader templates --"
|
echo "-- source bootloader templates --"
|
||||||
for cfg in \
|
for cfg in \
|
||||||
"${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
|
"${BUILDER_DIR}/config/bootloaders/grub-efi/grub.cfg" \
|
||||||
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
"${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
|
||||||
if [ -f "$cfg" ]; then
|
if [ -f "$cfg" ]; then
|
||||||
echo " file: $cfg"
|
echo " file: $cfg"
|
||||||
@@ -302,6 +354,12 @@ memtest_fail() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nvidia_runtime_fail() {
|
||||||
|
msg="$1"
|
||||||
|
echo "ERROR: ${msg}" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
iso_memtest_present() {
|
iso_memtest_present() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
iso_files="$(mktemp)"
|
iso_files="$(mktemp)"
|
||||||
@@ -439,6 +497,113 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_live_boot_entries() {
|
||||||
|
iso_path="$1"
|
||||||
|
echo "=== validating live boot entries in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || {
|
||||||
|
echo "ERROR: ISO not found for live boot validation: $iso_path" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
|
echo "ERROR: ISO reader unavailable for live boot validation" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grub_cfg="$(mktemp)"
|
||||||
|
isolinux_cfg="$(mktemp)"
|
||||||
|
|
||||||
|
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||||
|
echo "ERROR: failed to read boot/grub/grub.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: failed to read isolinux/live.cfg from ISO" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
if grep -q '@APPEND_LIVE@\|@KERNEL_LIVE@\|@INITRD_LIVE@' "$grub_cfg" "$isolinux_cfg"; then
|
||||||
|
echo "ERROR: unresolved live-build placeholders remain in ISO bootloader config" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'linux .*boot=live .*toram ' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
grep -q 'append .*boot=live .*toram ' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux toram entry is missing boot=live or toram" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
echo "=== live boot validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
|
validate_iso_nvidia_runtime() {
|
||||||
|
iso_path="$1"
|
||||||
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
|
|
||||||
|
echo "=== validating NVIDIA runtime in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
|
||||||
|
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
|
||||||
|
|
||||||
|
squashfs_tmp="$(mktemp)"
|
||||||
|
squashfs_list="$(mktemp)"
|
||||||
|
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
echo "=== NVIDIA runtime validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
append_memtest_grub_entry() {
|
append_memtest_grub_entry() {
|
||||||
grub_cfg="$1"
|
grub_cfg="$1"
|
||||||
[ -f "$grub_cfg" ] || return 1
|
[ -f "$grub_cfg" ] || return 1
|
||||||
@@ -477,6 +642,185 @@ label memtest
|
|||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extract_live_grub_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
live_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
live_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
[ -n "$live_linux" ] || return 1
|
||||||
|
[ -n "$live_initrd" ] || return 1
|
||||||
|
|
||||||
|
grub_kernel="$(printf '%s\n' "$live_linux" | awk '{print $2}')"
|
||||||
|
grub_append="$(printf '%s\n' "$live_linux" | cut -d' ' -f3-)"
|
||||||
|
grub_initrd="$(printf '%s\n' "$live_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$grub_kernel" ] || return 1
|
||||||
|
[ -n "$grub_append" ] || return 1
|
||||||
|
[ -n "$grub_initrd" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
load_live_build_append() {
|
||||||
|
lb_dir="$1"
|
||||||
|
binary_cfg="$lb_dir/config/binary"
|
||||||
|
[ -f "$binary_cfg" ] || return 1
|
||||||
|
|
||||||
|
# config/binary is generated by live-build and contains shell variable
|
||||||
|
# assignments such as LB_BOOTAPPEND_LIVE="boot=live ...".
|
||||||
|
# shellcheck disable=SC1090
|
||||||
|
. "$binary_cfg"
|
||||||
|
|
||||||
|
[ -n "${LB_BOOTAPPEND_LIVE:-}" ] || return 1
|
||||||
|
live_build_append="$LB_BOOTAPPEND_LIVE"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
extract_live_isolinux_entry() {
|
||||||
|
cfg="$1"
|
||||||
|
isolinux_linux="$(awk '/^[[:space:]]*linux[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_initrd="$(awk '/^[[:space:]]*initrd[[:space:]]+\/live\// { print; exit }' "$cfg")"
|
||||||
|
isolinux_append="$(awk '/^[[:space:]]*append[[:space:]]+/ { sub(/^[[:space:]]*append[[:space:]]+/, ""); print; exit }' "$cfg")"
|
||||||
|
[ -n "$isolinux_linux" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd" ] || return 1
|
||||||
|
[ -n "$isolinux_append" ] || return 1
|
||||||
|
|
||||||
|
isolinux_kernel="$(printf '%s\n' "$isolinux_linux" | awk '{print $2}')"
|
||||||
|
isolinux_initrd_path="$(printf '%s\n' "$isolinux_initrd" | awk '{print $2}')"
|
||||||
|
[ -n "$isolinux_kernel" ] || return 1
|
||||||
|
[ -n "$isolinux_initrd_path" ] || return 1
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_grub_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
append_live="$3"
|
||||||
|
initrd="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
|
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
write_canonical_isolinux_cfg() {
|
||||||
|
cfg="$1"
|
||||||
|
kernel="$2"
|
||||||
|
initrd="$3"
|
||||||
|
append_live="$4"
|
||||||
|
|
||||||
|
cat > "$cfg" <<EOF
|
||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-gsp-off
|
||||||
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
enforce_live_build_bootloader_assets() {
|
||||||
|
lb_dir="$1"
|
||||||
|
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||||
|
grub_dir="$lb_dir/binary/boot/grub"
|
||||||
|
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||||
|
|
||||||
|
if ! load_live_build_append "$lb_dir"; then
|
||||||
|
echo "bootloader sync: WARNING: could not load LB_BOOTAPPEND_LIVE from $lb_dir/config/binary" >&2
|
||||||
|
live_build_append=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$grub_cfg" ]; then
|
||||||
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
|
mkdir -p "$grub_dir/live-theme"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
||||||
|
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
||||||
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $grub_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -f "$isolinux_cfg" ]; then
|
||||||
|
if extract_live_isolinux_entry "$isolinux_cfg"; then
|
||||||
|
write_canonical_isolinux_cfg "$isolinux_cfg" "$isolinux_kernel" "$isolinux_initrd_path" "${live_build_append:-$isolinux_append}"
|
||||||
|
echo "bootloader sync: rewrote binary/isolinux/live.cfg with canonical EASY-BEE menu"
|
||||||
|
else
|
||||||
|
echo "bootloader sync: WARNING: could not extract live entry from $isolinux_cfg" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_from_deb() {
|
copy_memtest_from_deb() {
|
||||||
deb="$1"
|
deb="$1"
|
||||||
dst_boot="$2"
|
dst_boot="$2"
|
||||||
@@ -583,7 +927,7 @@ recover_iso_memtest() {
|
|||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
@@ -711,6 +1055,7 @@ run_optional_step_sh() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${LOG_DIR}" 2>/dev/null || true
|
||||||
step_log="${LOG_DIR}/${step_slug}.log"
|
step_log="${LOG_DIR}/${step_slug}.log"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== optional step: ${step_name} ==="
|
echo "=== optional step: ${step_name} ==="
|
||||||
@@ -734,13 +1079,14 @@ start_build_log
|
|||||||
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
# install them on the fly so NVIDIA modules and ISO kernel always match.
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ] || [ "${DEBIAN_KERNEL_ABI}" = "auto" ]; then
|
||||||
echo "=== refreshing apt index to detect current kernel ABI ==="
|
echo "=== refreshing apt index to detect current kernel ABI ==="
|
||||||
apt-get update -qq
|
apt-get update -qq || echo "WARNING: apt-get update failed, trying cached index"
|
||||||
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
DEBIAN_KERNEL_ABI=$(apt-cache depends linux-image-amd64 2>/dev/null \
|
||||||
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
| awk '/Depends:.*linux-image-[0-9]/{print $2}' \
|
||||||
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
| grep -oE '[0-9]+\.[0-9]+\.[0-9]+-[0-9]+' \
|
||||||
| head -1)
|
| head -1)
|
||||||
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
if [ -z "${DEBIAN_KERNEL_ABI}" ]; then
|
||||||
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
echo "ERROR: could not auto-detect kernel ABI from apt-cache" >&2
|
||||||
|
echo "Hint: set DEBIAN_KERNEL_ABI=x.y.z-N in iso/builder/VERSIONS to skip auto-detection" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
echo "=== kernel ABI: ${DEBIAN_KERNEL_ABI} ==="
|
||||||
@@ -757,7 +1103,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -809,9 +1155,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||||
|
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
|
if [ -n "$fp4_type_match" ]; then
|
||||||
|
echo "fp4_header_symbol=present"
|
||||||
|
echo "$fp4_type_match"
|
||||||
|
else
|
||||||
|
echo "fp4_header_symbol=missing"
|
||||||
|
fi
|
||||||
|
if [ -n "$fp4_scale_match" ]; then
|
||||||
|
echo "fp4_scale_mode_symbol=present"
|
||||||
|
echo "$fp4_scale_match"
|
||||||
|
else
|
||||||
|
echo "fp4_scale_mode_symbol=missing"
|
||||||
|
fi
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
for dep in \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
"${BUILDER_DIR}/VERSIONS"; do
|
||||||
|
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
|
||||||
|
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
|
||||||
|
GPU_STRESS_NEED_BUILD=1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
@@ -825,21 +1199,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
else
|
else
|
||||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||||
|
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||||
|
echo "fp4_profile_string=present"
|
||||||
|
else
|
||||||
|
echo "fp4_profile_string=missing"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
sync_builder_workdir "${BUILDER_DIR}" "${BUILD_WORK_DIR}"
|
||||||
--exclude='cache/' \
|
|
||||||
--exclude='chroot/' \
|
|
||||||
--exclude='.build/' \
|
|
||||||
--exclude='*.iso' \
|
|
||||||
--exclude='*.packages' \
|
|
||||||
--exclude='*.contents' \
|
|
||||||
--exclude='*.files' \
|
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
|
||||||
|
|
||||||
# Share deb package cache across variants.
|
# Share deb package cache across variants.
|
||||||
# Restore: populate work dir cache from shared cache before build.
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
@@ -937,10 +1309,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1011,13 +1383,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
|||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
@@ -1029,6 +1402,7 @@ fi
|
|||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
@@ -1039,6 +1413,11 @@ EOF
|
|||||||
|
|
||||||
# Write GPU vendor marker for hooks
|
# Write GPU vendor marker for hooks
|
||||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
fi
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
@@ -1074,6 +1453,7 @@ fi
|
|||||||
# --- substitute version placeholders in package list and archive ---
|
# --- substitute version placeholders in package list and archive ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
sed -i \
|
sed -i \
|
||||||
|
-e "s/%%NVIDIA_FABRICMANAGER_VERSION%%/${NVIDIA_FABRICMANAGER_VERSION}/g" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
@@ -1109,17 +1489,25 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
|
run_step_sh "rebuild live-build checksums after bootloader sync" "91b-lb-checksums" "lb binary_checksums 2>&1"
|
||||||
|
run_step_sh "rebuild ISO after bootloader sync" "91c-lb-binary-iso" "lb binary_iso 2>&1"
|
||||||
|
run_step_sh "rebuild zsync after bootloader sync" "91d-lb-zsync" "lb binary_zsync 2>&1"
|
||||||
|
|
||||||
# --- persist deb package cache back to shared location ---
|
# --- persist deb package cache back to shared location ---
|
||||||
# This allows the second variant to reuse all downloaded packages.
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
@@ -1144,9 +1532,11 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ insmod serial
|
|||||||
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
serial --unit=0 --speed=115200 --word=8 --parity=no --stop=1
|
||||||
|
|
||||||
insmod gfxterm
|
insmod gfxterm
|
||||||
insmod png
|
|
||||||
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
|
insmod png
|
||||||
|
source /boot/grub/theme.cfg
|
||||||
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
28
iso/builder/config/bootloaders/grub-efi/grub.cfg
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE -- load to RAM (toram)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
BIN
iso/builder/config/bootloaders/grub-efi/live-theme/bee-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 78 KiB |
@@ -5,6 +5,13 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
|
#bee logo - centered, upper third of screen
|
||||||
|
+ image {
|
||||||
|
top = 4%
|
||||||
|
left = 50%-200
|
||||||
|
file = "bee-logo.png"
|
||||||
|
}
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
+ label {
|
+ label {
|
||||||
top = 100%-50
|
top = 100%-50
|
||||||
@@ -21,17 +28,17 @@ terminal-font: "Unifont Regular 16"
|
|||||||
+ boot_menu {
|
+ boot_menu {
|
||||||
left = 20%
|
left = 20%
|
||||||
width = 60%
|
width = 60%
|
||||||
top = 62%
|
top = 65%
|
||||||
height = 38%-80
|
height = 35%-80
|
||||||
item_color = "#c88000"
|
item_color = "#c88000"
|
||||||
item_font = "Unifont Regular 16"
|
item_font = "Unifont Regular 16"
|
||||||
selected_item_color= "#f5a800"
|
selected_item_color= "#f5a800"
|
||||||
selected_item_font = "Unifont Regular 16"
|
selected_item_font = "Unifont Regular 16"
|
||||||
item_height = 16
|
item_height = 20
|
||||||
item_padding = 0
|
item_padding = 2
|
||||||
item_spacing = 4
|
item_spacing = 4
|
||||||
icon_width = 0
|
icon_width = 0
|
||||||
icon_heigh = 0
|
icon_height = 0
|
||||||
item_icon_space = 0
|
item_icon_space = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
9
iso/builder/config/bootloaders/grub-efi/theme.cfg
Normal file
9
iso/builder/config/bootloaders/grub-efi/theme.cfg
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
set color_normal=light-gray/black
|
||||||
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
|
if [ -e /boot/grub/live-theme/theme.txt ]; then
|
||||||
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
|
else
|
||||||
|
set menu_color_normal=yellow/black
|
||||||
|
set menu_color_highlight=white/brown
|
||||||
|
fi
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
source /boot/grub/config.cfg
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
chainloader /boot/memtest86+x64.efi
|
|
||||||
}
|
|
||||||
else
|
|
||||||
menuentry "Memory Test (memtest86+)" {
|
|
||||||
linux16 /boot/memtest86+x64.bin
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
|
||||||
menuentry "UEFI Firmware Settings" {
|
|
||||||
fwsetup
|
|
||||||
}
|
|
||||||
fi
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
set color_normal=light-gray/black
|
|
||||||
set color_highlight=white/dark-gray
|
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
|
||||||
else
|
|
||||||
set menu_color_normal=cyan/black
|
|
||||||
set menu_color_highlight=white/dark-gray
|
|
||||||
fi
|
|
||||||
@@ -3,37 +3,37 @@ label live-@FLAVOUR@-normal
|
|||||||
menu default
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms
|
|
||||||
menu label EASY-BEE (^graphics/KMS)
|
|
||||||
linux @LINUX@
|
|
||||||
initrd @INITRD@
|
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE (^load to RAM)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-kms-gsp-off
|
label live-@FLAVOUR@-kms
|
||||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
menu label EASY-BEE (^KMS, no nomodeset)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
append @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms-gsp-off
|
||||||
|
menu label EASY-BEE (KMS, ^GSP=off)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-failsafe
|
label live-@FLAVOUR@-failsafe
|
||||||
menu label EASY-BEE (^fail-safe)
|
menu label EASY-BEE (^fail-safe)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
|
|||||||
@@ -25,11 +25,14 @@ ensure_bee_console_user() {
|
|||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable common bee services
|
# Enable common bee services
|
||||||
|
systemctl enable bee-hpc-tuning.service
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable bee-selfheal.timer
|
||||||
|
systemctl enable bee-boot-status.service
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
systemctl enable lightdm.service 2>/dev/null || true
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
@@ -40,6 +43,7 @@ systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
|||||||
# Enable GPU-vendor specific services
|
# Enable GPU-vendor specific services
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable nvidia-fabricmanager.service 2>/dev/null || true
|
||||||
systemctl enable bee-nvidia.service
|
systemctl enable bee-nvidia.service
|
||||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
@@ -53,11 +57,16 @@ fi
|
|||||||
# nogpu: no GPU services needed
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
|
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
|
||||||
|
#
|
||||||
|
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
|
||||||
|
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
|
||||||
|
# so the ioctl returns EINVAL and the verification step fails.
|
||||||
|
#
|
||||||
|
# The patch replaces the replace call so that if --direct-io=on fails it falls
|
||||||
|
# back to a plain replace without direct-io, and also relaxes the verification
|
||||||
|
# to a warning so the boot continues even when re-association is imperfect.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
|
||||||
|
|
||||||
|
if [ ! -f "${TORAM_SCRIPT}" ]; then
|
||||||
|
echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# Replace any losetup --replace call that includes --direct-io=on with a
|
||||||
|
# version that first tries with direct-io, then retries without it.
|
||||||
|
#
|
||||||
|
# The sed expression turns:
|
||||||
|
# losetup --replace ... --direct-io=on LOOP FILE
|
||||||
|
# into a shell snippet that tries both, silently.
|
||||||
|
#
|
||||||
|
# We also downgrade the fatal "Task finished with error." block to a warning
|
||||||
|
# so the boot continues if re-association fails (squashfs still accessible).
|
||||||
|
|
||||||
|
# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
|
||||||
|
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# 2. Turn the hard error into a warning so boot continues.
|
||||||
|
# live-boot prints this exact string when verification fails.
|
||||||
|
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patch applied"
|
||||||
|
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
|
||||||
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
46
iso/builder/config/hooks/normal/9011-toram-rsync.hook.chroot
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9011-toram-rsync.hook.chroot
|
||||||
|
#
|
||||||
|
# Adds rsync to the initramfs so that live-boot's toram code takes the
|
||||||
|
# rsync --progress path instead of the silent "cp -a" fallback.
|
||||||
|
#
|
||||||
|
# live-boot's 9990-toram-todisk.sh already contains:
|
||||||
|
# if [ -x /bin/rsync ]; then
|
||||||
|
# rsync -a --progress ... 1>/dev/console
|
||||||
|
# else
|
||||||
|
# cp -a ... # no output
|
||||||
|
# fi
|
||||||
|
#
|
||||||
|
# We install an initramfs-tools hook that calls copy_exec /usr/bin/rsync,
|
||||||
|
# which copies the binary + all shared-library dependencies into the initrd.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||||
|
HOOK="${HOOK_DIR}/bee-rsync"
|
||||||
|
|
||||||
|
mkdir -p "${HOOK_DIR}"
|
||||||
|
|
||||||
|
cat > "${HOOK}" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
# initramfs hook: include rsync for live-boot toram progress output
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
. /usr/share/initramfs-tools/hook-functions
|
||||||
|
|
||||||
|
if [ -x /usr/bin/rsync ]; then
|
||||||
|
copy_exec /usr/bin/rsync /bin
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${HOOK}"
|
||||||
|
|
||||||
|
echo "9011-toram-rsync: installed initramfs hook at ${HOOK}"
|
||||||
|
|
||||||
|
# Rebuild initramfs so the hook takes effect in the ISO's initrd.img
|
||||||
|
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||||
|
echo "9011-toram-rsync: rebuilding initramfs for kernel ${KVER}"
|
||||||
|
update-initramfs -u -k "${KVER}"
|
||||||
|
echo "9011-toram-rsync: done"
|
||||||
@@ -5,6 +5,8 @@ set -e
|
|||||||
|
|
||||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||||
|
|
||||||
|
# memtest86+ 6.x uses memtest86+.bin (no x64 suffix) for the BIOS binary,
|
||||||
|
# while 5.x used memtest86+x64.bin. We normalise both to x64 names in the ISO.
|
||||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||||
BINARY_BOOT_DIR="binary/boot"
|
BINARY_BOOT_DIR="binary/boot"
|
||||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||||
@@ -24,15 +26,23 @@ fail_or_warn() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# grub.cfg and live.cfg may not exist yet when binary hooks run — live-build
|
||||||
|
# creates them after this hook (lb binary_grub-efi / lb binary_syslinux).
|
||||||
|
# The template already has memtest entries hardcoded, so a missing config file
|
||||||
|
# here is not an error; validate_iso_memtest() checks the final ISO instead.
|
||||||
|
warn_only() {
|
||||||
|
log "WARNING: $1"
|
||||||
|
}
|
||||||
|
|
||||||
copy_memtest_file() {
|
copy_memtest_file() {
|
||||||
src="$1"
|
src="$1"
|
||||||
base="$(basename "$src")"
|
dst_name="${2:-$(basename "$src")}"
|
||||||
dst="${BINARY_BOOT_DIR}/${base}"
|
dst="${BINARY_BOOT_DIR}/${dst_name}"
|
||||||
|
|
||||||
[ -f "$src" ] || return 1
|
[ -f "$src" ] || return 1
|
||||||
mkdir -p "${BINARY_BOOT_DIR}"
|
mkdir -p "${BINARY_BOOT_DIR}"
|
||||||
cp "$src" "$dst"
|
cp "$src" "$dst"
|
||||||
log "copied ${base} from ${src}"
|
log "copied ${dst_name} from ${src}"
|
||||||
}
|
}
|
||||||
|
|
||||||
extract_memtest_from_deb() {
|
extract_memtest_from_deb() {
|
||||||
@@ -41,14 +51,44 @@ extract_memtest_from_deb() {
|
|||||||
|
|
||||||
log "extracting memtest payload from ${deb}"
|
log "extracting memtest payload from ${deb}"
|
||||||
dpkg-deb -x "$deb" "$tmpdir"
|
dpkg-deb -x "$deb" "$tmpdir"
|
||||||
for f in ${MEMTEST_FILES}; do
|
|
||||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
# EFI binary: both 5.x and 6.x use memtest86+x64.efi
|
||||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
if [ -f "${tmpdir}/boot/memtest86+x64.efi" ]; then
|
||||||
fi
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.efi"
|
||||||
done
|
fi
|
||||||
|
|
||||||
|
# BIOS binary: 5.x = memtest86+x64.bin, 6.x = memtest86+.bin
|
||||||
|
if [ -f "${tmpdir}/boot/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+x64.bin"
|
||||||
|
elif [ -f "${tmpdir}/boot/memtest86+.bin" ]; then
|
||||||
|
copy_memtest_file "${tmpdir}/boot/memtest86+.bin" "memtest86+x64.bin"
|
||||||
|
fi
|
||||||
|
|
||||||
rm -rf "$tmpdir"
|
rm -rf "$tmpdir"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
download_and_extract_memtest() {
|
||||||
|
tmpdl="$(mktemp -d)"
|
||||||
|
if [ -n "${MEMTEST_VERSION:-}" ]; then
|
||||||
|
pkg_spec="memtest86+=${MEMTEST_VERSION}"
|
||||||
|
else
|
||||||
|
pkg_spec="memtest86+"
|
||||||
|
fi
|
||||||
|
log "downloading ${pkg_spec} from apt"
|
||||||
|
if ! ( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ); then
|
||||||
|
log "apt download failed, retrying after apt-get update"
|
||||||
|
apt-get update -qq >/dev/null 2>&1 || true
|
||||||
|
( cd "$tmpdl" && apt-get download "$pkg_spec" 2>/dev/null ) || true
|
||||||
|
fi
|
||||||
|
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||||
|
if [ -n "$deb" ]; then
|
||||||
|
extract_memtest_from_deb "$deb"
|
||||||
|
else
|
||||||
|
log "apt download of memtest86+ failed"
|
||||||
|
fi
|
||||||
|
rm -rf "$tmpdl"
|
||||||
|
}
|
||||||
|
|
||||||
ensure_memtest_binaries() {
|
ensure_memtest_binaries() {
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
@@ -56,10 +96,15 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 1. Try files already placed by lb binary_memtest or chroot
|
||||||
for root in chroot/boot /boot; do
|
for root in chroot/boot /boot; do
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||||
done
|
done
|
||||||
|
# 6.x BIOS binary may lack x64 in name — copy with normalised name
|
||||||
|
if [ ! -f "${BINARY_BOOT_DIR}/memtest86+x64.bin" ]; then
|
||||||
|
copy_memtest_file "${root}/memtest86+.bin" "memtest86+x64.bin" || true
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
@@ -68,6 +113,7 @@ ensure_memtest_binaries() {
|
|||||||
done
|
done
|
||||||
[ "$missing" -eq 1 ] || return 0
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 2. Try apt package cache (may be empty if lb binary_memtest already purged)
|
||||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||||
[ -d "$root" ] || continue
|
[ -d "$root" ] || continue
|
||||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||||
@@ -76,6 +122,15 @@ ensure_memtest_binaries() {
|
|||||||
break
|
break
|
||||||
done
|
done
|
||||||
|
|
||||||
|
missing=0
|
||||||
|
for f in ${MEMTEST_FILES}; do
|
||||||
|
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||||
|
done
|
||||||
|
[ "$missing" -eq 1 ] || return 0
|
||||||
|
|
||||||
|
# 3. Fallback: download fresh from apt (lb binary_memtest purges the cache)
|
||||||
|
download_and_extract_memtest
|
||||||
|
|
||||||
missing=0
|
missing=0
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||||
@@ -88,7 +143,7 @@ ensure_memtest_binaries() {
|
|||||||
|
|
||||||
ensure_grub_entry() {
|
ensure_grub_entry() {
|
||||||
[ -f "$GRUB_CFG" ] || {
|
[ -f "$GRUB_CFG" ] || {
|
||||||
fail_or_warn "missing ${GRUB_CFG}"
|
warn_only "missing ${GRUB_CFG} (will be created by lb binary_grub-efi from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -114,7 +169,7 @@ EOF
|
|||||||
|
|
||||||
ensure_isolinux_entry() {
|
ensure_isolinux_entry() {
|
||||||
[ -f "$ISOLINUX_CFG" ] || {
|
[ -f "$ISOLINUX_CFG" ] || {
|
||||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
warn_only "missing ${ISOLINUX_CFG} (will be created by lb binary_syslinux from template)"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user