Compare commits
135 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1a5035edd | ||
|
|
8fc986c933 | ||
|
|
88b5e0edf2 | ||
|
|
82fe1f6d26 | ||
| 81e7c921f8 | |||
| 0fb8f2777f | |||
| bf182daa89 | |||
| 457ea1cf04 | |||
| bf6ecab4f0 | |||
| 02e44b1172 | |||
| 2ceaa0d0ca | |||
| 9482ba20a2 | |||
| 813e2f86a9 | |||
| 58a6da9b44 | |||
| f4a19c0a00 | |||
| 9e3dcf9b4d | |||
| 098e19f760 | |||
| e16d0f34b5 | |||
|
|
525ed8b8fc | ||
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d | |||
|
|
0a4bb596f6 | ||
|
|
531d1ca366 | ||
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 | |||
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
LISTEN ?= :8080
|
||||
AUDIT_PATH ?=
|
||||
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||
|
||||
RUN_ARGS := web --listen $(LISTEN)
|
||||
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||
ifneq ($(AUDIT_PATH),)
|
||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||
endif
|
||||
@@ -9,10 +12,11 @@ endif
|
||||
.PHONY: run build test
|
||||
|
||||
run:
|
||||
go run ./cmd/bee $(RUN_ARGS)
|
||||
mkdir -p $(EXPORT_DIR)
|
||||
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||
|
||||
build:
|
||||
go build -o bee ./cmd/bee
|
||||
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||
|
||||
test:
|
||||
go test ./...
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -21,30 +22,7 @@ var Version = "dev"
|
||||
func buildLabel() string {
|
||||
label := strings.TrimSpace(Version)
|
||||
if label == "" {
|
||||
label = "dev"
|
||||
}
|
||||
if info, ok := debug.ReadBuildInfo(); ok {
|
||||
var revision string
|
||||
var modified bool
|
||||
for _, setting := range info.Settings {
|
||||
switch setting.Key {
|
||||
case "vcs.revision":
|
||||
revision = setting.Value
|
||||
case "vcs.modified":
|
||||
modified = setting.Value == "true"
|
||||
}
|
||||
}
|
||||
if revision != "" {
|
||||
short := revision
|
||||
if len(short) > 12 {
|
||||
short = short[:12]
|
||||
}
|
||||
label += " (" + short
|
||||
if modified {
|
||||
label += "+"
|
||||
}
|
||||
label += ")"
|
||||
}
|
||||
return "dev"
|
||||
}
|
||||
return label
|
||||
}
|
||||
@@ -53,10 +31,19 @@ func main() {
|
||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||
}
|
||||
|
||||
func run(args []string, stdout, stderr io.Writer) int {
|
||||
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||
Level: slog.LevelInfo,
|
||||
})))
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
slog.Error("fatal panic",
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
exitCode = 1
|
||||
}
|
||||
}()
|
||||
|
||||
if len(args) == 0 {
|
||||
printRootUsage(stderr)
|
||||
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -98,8 +87,9 @@ func printRootUsage(w io.Writer) {
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -304,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
||||
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||
fs.Usage = func() {
|
||||
@@ -390,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
slog.Info("sat archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 0
|
||||
}
|
||||
target := args[0]
|
||||
if target != "nvidia" {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||
return 2
|
||||
}
|
||||
|
||||
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||
return 2
|
||||
}
|
||||
|
||||
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||
if err != nil {
|
||||
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||
Profile: *profile,
|
||||
SizeMB: *sizeMB,
|
||||
GPUIndices: includeIndices,
|
||||
ExcludeGPUIndices: excludeIndices,
|
||||
RunNCCL: !*skipNCCL,
|
||||
}, logLine)
|
||||
if err != nil {
|
||||
slog.Error("run benchmark", "target", target, "err", err)
|
||||
return 1
|
||||
}
|
||||
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||
return 0
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
return nil, nil
|
||||
}
|
||||
var indices []int
|
||||
for _, part := range strings.Split(raw, ",") {
|
||||
part = strings.TrimSpace(part)
|
||||
if part == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(part)
|
||||
if err != nil || value < 0 {
|
||||
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||
}
|
||||
indices = append(indices, value)
|
||||
}
|
||||
return indices, nil
|
||||
}
|
||||
|
||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunVersion(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
old := Version
|
||||
Version = "test-version"
|
||||
t.Cleanup(func() { Version = old })
|
||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||
old := Version
|
||||
Version = "1.2.3"
|
||||
t.Cleanup(func() { Version = old })
|
||||
|
||||
if got := buildLabel(); got != "1.2.3" {
|
||||
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunExportRequiresTarget(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -19,17 +19,18 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultExportDir = "/appdata/bee/export"
|
||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
|
||||
)
|
||||
|
||||
type App struct {
|
||||
@@ -40,6 +41,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -80,6 +83,7 @@ type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -100,6 +104,10 @@ func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -107,9 +115,17 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
@@ -131,7 +147,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -140,19 +156,32 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
@@ -160,7 +189,8 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -175,10 +205,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
@@ -203,10 +230,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
||||
return "stdout", err
|
||||
case strings.HasPrefix(output, "file:"):
|
||||
path := strings.TrimPrefix(output, "file:")
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
@@ -276,6 +300,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -497,6 +524,15 @@ func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -509,10 +545,56 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBenchmarkBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
@@ -521,14 +603,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -553,14 +635,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -733,6 +815,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -767,6 +850,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
@@ -843,6 +927,41 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||
// component-status DB so they are visible in the Hardware Summary card.
|
||||
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||
if db == nil || len(psus) == 0 {
|
||||
return
|
||||
}
|
||||
const source = "audit:ipmi"
|
||||
worstStatus := "OK"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
slot := "?"
|
||||
if psu.Slot != nil {
|
||||
slot = *psu.Slot
|
||||
}
|
||||
st := *psu.Status
|
||||
detail := ""
|
||||
if psu.ErrorDescription != nil {
|
||||
detail = *psu.ErrorDescription
|
||||
}
|
||||
db.Record("psu:"+slot, source, st, detail)
|
||||
switch st {
|
||||
case "Critical":
|
||||
worstStatus = "Critical"
|
||||
case "Warning":
|
||||
if worstStatus != "Critical" {
|
||||
worstStatus = "Warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
db.Record("psu:all", source, worstStatus, "")
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
@@ -861,6 +980,12 @@ func latestSATSummaries() []string {
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
|
||||
@@ -120,15 +120,23 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
listNvidiaGPUStatusesFn func() ([]platform.NvidiaGPUStatus, error)
|
||||
resetNvidiaGPUFn func(int) (string, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
@@ -139,6 +147,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBenchmarkFn != nil {
|
||||
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerFn != nil {
|
||||
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPulseFn != nil {
|
||||
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaBandwidthFn != nil {
|
||||
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
@@ -153,11 +203,25 @@ func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
if f.listNvidiaGPUStatusesFn != nil {
|
||||
return f.listNvidiaGPUStatusesFn()
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
if f.resetNvidiaGPUFn != nil {
|
||||
return f.resetNvidiaGPUFn(index)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
@@ -478,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -516,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -579,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
@@ -660,13 +718,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
@@ -698,6 +793,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -707,6 +803,36 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
for _, want := range []string{
|
||||
"/system/ip-link.txt",
|
||||
"/system/ip-link-stats.txt",
|
||||
"/system/kernel-aer-nvidia.txt",
|
||||
"/system/lspci-nvidia-bridges-vv.txt",
|
||||
"/system/pcie-aer-sysfs.txt",
|
||||
"/system/ethtool-info.txt",
|
||||
"/system/ethtool-link.txt",
|
||||
"/system/ethtool-module.txt",
|
||||
"/system/mstflint-query.txt",
|
||||
} {
|
||||
var found bool
|
||||
for _, name := range names {
|
||||
if contains(name, want) {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
@@ -721,6 +847,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -734,6 +866,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -749,12 +885,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
|
||||
48
audit/internal/app/atomic_write.go
Normal file
48
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||
}
|
||||
|
||||
tmpPath := path + ".tmp"
|
||||
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||
}
|
||||
|
||||
success := false
|
||||
defer func() {
|
||||
_ = f.Close()
|
||||
if !success {
|
||||
_ = os.Remove(tmpPath)
|
||||
}
|
||||
}()
|
||||
|
||||
if _, err := f.Write(data); err != nil {
|
||||
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||
}
|
||||
if err := os.Rename(tmpPath, path); err != nil {
|
||||
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||
}
|
||||
|
||||
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||
_ = dir.Sync()
|
||||
_ = dir.Close()
|
||||
}
|
||||
|
||||
success = true
|
||||
return nil
|
||||
}
|
||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||
t.Fatalf("seed file: %v", err)
|
||||
}
|
||||
|
||||
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||
t.Fatalf("atomicWriteFile: %v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read final: %v", err)
|
||||
}
|
||||
if string(raw) != "new\n" {
|
||||
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||
a := &App{
|
||||
runtime: fakeRuntime{
|
||||
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||
return schema.RuntimeHealth{
|
||||
Status: "OK",
|
||||
ExportDir: exportDir,
|
||||
DriverReady: true,
|
||||
CUDAReady: true,
|
||||
}, nil
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got, err := a.RunRuntimePreflight("file:" + path)
|
||||
if err != nil {
|
||||
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||
}
|
||||
if got != path {
|
||||
t.Fatalf("path=%q want %q", got, path)
|
||||
}
|
||||
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read runtime file: %v", err)
|
||||
}
|
||||
var health schema.RuntimeHealth
|
||||
if err := json.Unmarshal(raw, &health); err != nil {
|
||||
t.Fatalf("json unmarshal: %v", err)
|
||||
}
|
||||
if health.Status != "OK" {
|
||||
t.Fatalf("status=%q want OK", health.Status)
|
||||
}
|
||||
}
|
||||
268
audit/internal/app/component_status_db.go
Normal file
268
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,268 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is one observation written to a component's history.
|
||||
type ComponentStatusEntry struct {
|
||||
At time.Time `json:"at"`
|
||||
Status string `json:"status"`
|
||||
Source string `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
|
||||
Detail string `json:"detail,omitempty"`
|
||||
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity returns a numeric severity so higher values win.
|
||||
func componentSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
return 3
|
||||
case "Warning":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func satStatusToDBStatus(overall string) string {
|
||||
switch overall {
|
||||
case "OK":
|
||||
return "OK"
|
||||
case "FAILED":
|
||||
return "Warning"
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return "Unknown"
|
||||
default:
|
||||
return "Unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
func extractArchivePath(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if strings.HasSuffix(s, ".tar.gz") {
|
||||
parts := strings.Fields(s)
|
||||
if len(parts) > 0 {
|
||||
return parts[len(parts)-1]
|
||||
}
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func parseSATKV(raw string) map[string]string {
|
||||
kv := make(map[string]string)
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
k, v, ok := strings.Cut(strings.TrimSpace(line), "=")
|
||||
if ok {
|
||||
kv[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return kv
|
||||
}
|
||||
@@ -3,13 +3,14 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -18,6 +19,7 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
|
||||
applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
|
||||
applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
|
||||
}
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
|
||||
applyMemorySAT(snap.Memory, summary)
|
||||
@@ -28,6 +30,102 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type nvidiaPerGPUStatus struct {
|
||||
runStatus string
|
||||
reason string
|
||||
}
|
||||
|
||||
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
|
||||
statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
for i := range devs {
|
||||
if devs[i].Telemetry == nil {
|
||||
continue
|
||||
}
|
||||
rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
idx, ok := telemetryInt(rawIdx)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
st, ok := statusByIndex[idx]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
status, description, ok := satKeyStatus(st.runStatus, firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT"))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
|
||||
}
|
||||
}
|
||||
|
||||
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
|
||||
matches, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
sort.Strings(matches)
|
||||
runDir := matches[len(matches)-1]
|
||||
summaryRaw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return nil, "", false
|
||||
}
|
||||
summaryKV := parseKeyValueSummary(string(summaryRaw))
|
||||
runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])
|
||||
files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
out := make(map[int]nvidiaPerGPUStatus, len(files))
|
||||
for _, file := range files {
|
||||
raw, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
kv := parseKeyValueSummary(string(raw))
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[idx] = nvidiaPerGPUStatus{
|
||||
runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
|
||||
reason: strings.TrimSpace(kv["reason"]),
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return nil, "", false
|
||||
}
|
||||
return out, runAtUTC, true
|
||||
}
|
||||
|
||||
func telemetryInt(v any) (int, bool) {
|
||||
switch value := v.(type) {
|
||||
case int:
|
||||
return value, true
|
||||
case int32:
|
||||
return int(value), true
|
||||
case int64:
|
||||
return int(value), true
|
||||
case float64:
|
||||
return int(value), true
|
||||
case string:
|
||||
n, err := strconv.Atoi(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -174,6 +272,31 @@ func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
|
||||
if component == nil || satStatus == "" {
|
||||
return
|
||||
}
|
||||
current := strings.TrimSpace(ptrString(component.Status))
|
||||
newSeverity := statusSeverity(satStatus)
|
||||
currentSeverity := statusSeverity(current)
|
||||
if current == "" || current == "Unknown" || newSeverity > currentSeverity {
|
||||
mergeComponentStatus(component, changedAt, satStatus, description)
|
||||
return
|
||||
}
|
||||
if newSeverity == currentSeverity && strings.TrimSpace(description) != "" {
|
||||
component.Status = appStringPtr(satStatus)
|
||||
component.ErrorDescription = appStringPtr(description)
|
||||
if strings.TrimSpace(changedAt) != "" {
|
||||
component.StatusChangedAt = appStringPtr(changedAt)
|
||||
component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
|
||||
Status: satStatus,
|
||||
ChangedAt: changedAt,
|
||||
Details: appStringPtr(description),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func statusSeverity(status string) int {
|
||||
switch strings.TrimSpace(status) {
|
||||
case "Critical":
|
||||
@@ -206,6 +329,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
|
||||
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
|
||||
func dbStatusToSATStatus(s string) string {
|
||||
switch strings.TrimSpace(s) {
|
||||
case "OK", "Warning", "Critical", "Unknown":
|
||||
return s
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
|
||||
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
|
||||
func sanitizeBDFForLookup(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if bdf == "" || bdf == "gpu" || strings.ContainsAny(bdf, " \t") {
|
||||
return ""
|
||||
}
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
bdf = "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,9 +53,57 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyLatestSATStatusesMarksNvidiaGPUByPerGPUStatusFile(t *testing.T) {
|
||||
baseDir := t.TempDir()
|
||||
runDir := filepath.Join(baseDir, "gpu-nvidia-20260407-162123")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte("run_at_utc=2026-04-07T16:21:23Z\noverall_status=FAILED\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "gpu-1-status.txt"), []byte("gpu_index=1\ngpu_name=NVIDIA H100 PCIe\nrun_status=FAILED\nreason=GPU requires reset\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
class := "VideoController"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
bdf0 := "0000:4b:00.0"
|
||||
bdf1 := "0000:4f:00.0"
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf0,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 0},
|
||||
},
|
||||
{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
BDF: &bdf1,
|
||||
Telemetry: map[string]any{"nvidia_gpu_index": 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[1].Status == nil || *snap.PCIeDevices[1].Status != "Critical" {
|
||||
t.Fatalf("gpu1 status=%v want Critical", snap.PCIeDevices[1].Status)
|
||||
}
|
||||
if snap.PCIeDevices[1].ErrorDescription == nil || *snap.PCIeDevices[1].ErrorDescription != "GPU requires reset failed" {
|
||||
got := "<nil>"
|
||||
if snap.PCIeDevices[1].ErrorDescription != nil {
|
||||
got = *snap.PCIeDevices[1].ErrorDescription
|
||||
}
|
||||
t.Fatalf("gpu1 error=%q want per-gpu reason", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
|
||||
"bee-network.service",
|
||||
"bee-nvidia.service",
|
||||
"bee-preflight.service",
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
}
|
||||
|
||||
@@ -27,15 +29,176 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/lspci-nvidia-bridges-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
bridge=$(basename "$(readlink -f "/sys/bus/pci/devices/$gpu/.." 2>/dev/null)" 2>/dev/null)
|
||||
if [ -n "$bridge" ] && [ "$bridge" != "$gpu" ]; then
|
||||
echo
|
||||
echo "=== UPSTREAM $bridge for $gpu ==="
|
||||
lspci -s "$bridge" -vv 2>&1 || true
|
||||
fi
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no NVIDIA PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
{name: "system/pcie-aer-sysfs.txt", cmd: []string{"sh", "-c", `
|
||||
found=0
|
||||
for dev in /sys/bus/pci/devices/*; do
|
||||
[ -e "$dev" ] || continue
|
||||
bdf=$(basename "$dev")
|
||||
block=""
|
||||
for f in aer_dev_correctable aer_dev_fatal aer_dev_nonfatal aer_rootport_total_err_cor aer_rootport_total_err_fatal aer_rootport_total_err_nonfatal; do
|
||||
if [ -r "$dev/$f" ]; then
|
||||
if [ -z "$block" ]; then
|
||||
block=1
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
fi
|
||||
printf " %-30s %s\n" "$f" "$(cat "$dev/$f" 2>/dev/null)"
|
||||
fi
|
||||
done
|
||||
if [ -n "$block" ]; then
|
||||
echo
|
||||
fi
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no PCIe AER sysfs counters found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -i "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v ethtool >/dev/null 2>&1; then
|
||||
echo "ethtool not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/class/net/*; do
|
||||
[ -e "$path" ] || continue
|
||||
iface=$(basename "$path")
|
||||
[ "$iface" = "lo" ] && continue
|
||||
found=1
|
||||
echo "=== $iface ==="
|
||||
ethtool -m "$iface" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no interfaces found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v mstflint >/dev/null 2>&1; then
|
||||
echo "mstflint not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for path in /sys/bus/pci/devices/*; do
|
||||
[ -e "$path/vendor" ] || continue
|
||||
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x15b3" ] || continue
|
||||
bdf=$(basename "$path")
|
||||
found=1
|
||||
echo "=== $bdf ==="
|
||||
mstflint -d "$bdf" q 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no Mellanox/NVIDIA networking devices found"
|
||||
fi
|
||||
`}},
|
||||
}
|
||||
|
||||
var supportBundleOptionalFiles = []struct {
|
||||
name string
|
||||
src string
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
@@ -48,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
host := sanitizeFilename(hostnameOr("unknown"))
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -75,45 +243,79 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
type entry struct {
|
||||
path string
|
||||
mod time.Time
|
||||
entries := supportBundleEntries(matches)
|
||||
for path, mod := range entries {
|
||||
if time.Since(mod) > 24*time.Hour {
|
||||
_ = os.Remove(path)
|
||||
delete(entries, path)
|
||||
}
|
||||
}
|
||||
list := make([]entry, 0, len(matches))
|
||||
ordered := orderSupportBundles(entries)
|
||||
if len(ordered) > 3 {
|
||||
for _, old := range ordered[3:] {
|
||||
_ = os.Remove(old)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func latestSupportBundlePath(dir string) (string, error) {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||
if len(ordered) == 0 {
|
||||
return "", os.ErrNotExist
|
||||
}
|
||||
return ordered[0], nil
|
||||
}
|
||||
|
||||
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||
entries := make(map[string]time.Time, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||
_ = os.Remove(match)
|
||||
continue
|
||||
}
|
||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||
entries[match] = info.ModTime()
|
||||
}
|
||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||
if len(list) > 3 {
|
||||
for _, old := range list[3:] {
|
||||
_ = os.Remove(old.path)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||
ordered := make([]string, 0, len(entries))
|
||||
for path := range entries {
|
||||
ordered = append(ordered, path)
|
||||
}
|
||||
return nil
|
||||
sort.Slice(ordered, func(i, j int) bool {
|
||||
return entries[ordered[i]].After(entries[ordered[j]])
|
||||
})
|
||||
return ordered
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
@@ -152,6 +354,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func copyOptionalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
_, err = io.Copy(out, in)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
@@ -188,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func bundleVersion() string {
|
||||
v := buildVersion()
|
||||
v = strings.TrimPrefix(v, "v")
|
||||
v = strings.TrimPrefix(v, "V")
|
||||
if v == "" || v == "unknown" {
|
||||
return "0.0"
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func serverModelForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Product Name" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return strings.ReplaceAll(val, " ", "_")
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func serverSerialForBundle() string {
|
||||
raw, err := exec.Command("dmidecode", "-t", "1").Output()
|
||||
if err != nil {
|
||||
return "unknown"
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
key, val, ok := strings.Cut(line, ": ")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(key) == "Serial Number" {
|
||||
val = strings.TrimSpace(val)
|
||||
if val == "" {
|
||||
return "unknown"
|
||||
}
|
||||
return val
|
||||
}
|
||||
}
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
@@ -215,7 +489,7 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
@@ -227,7 +501,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||
}
|
||||
|
||||
func normalizeSupportBundleAuditJSON(path string) error {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
normalized, err := ApplySATOverlay(data)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return os.WriteFile(path, normalized, 0644)
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
package collector
|
||||
|
||||
import "bee/audit/internal/schema"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
|
||||
@@ -2,18 +2,21 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
mstflintQuery = func(bdf string) (string, error) {
|
||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -21,7 +24,7 @@ var (
|
||||
}
|
||||
|
||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -29,6 +32,14 @@ var (
|
||||
}
|
||||
|
||||
netIfacesByBDF = listNetIfacesByBDF
|
||||
readNetCarrierFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||
@@ -162,3 +173,9 @@ func listNetIfacesByBDF(bdf string) []string {
|
||||
}
|
||||
return ifaces
|
||||
}
|
||||
|
||||
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
|
||||
var (
|
||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
||||
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -113,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
}
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case key == "identifier":
|
||||
s := parseSFPIdentifier(val)
|
||||
dev.SFPIdentifier = &s
|
||||
t := true
|
||||
dev.SFPPresent = &t
|
||||
changed = true
|
||||
case key == "connector":
|
||||
s := parseSFPConnector(val)
|
||||
dev.SFPConnector = &s
|
||||
changed = true
|
||||
case key == "vendor name":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPVendor = &s
|
||||
changed = true
|
||||
case key == "vendor pn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPPartNumber = &s
|
||||
changed = true
|
||||
case key == "vendor sn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPSerialNumber = &s
|
||||
changed = true
|
||||
case strings.Contains(key, "laser wavelength"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPWavelengthNM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPTemperatureC = &f
|
||||
@@ -145,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
return changed
|
||||
}
|
||||
|
||||
// parseSFPIdentifier extracts the human-readable transceiver type from the
|
||||
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
|
||||
func parseSFPIdentifier(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
// parseSFPConnector extracts the connector type from the raw ethtool line,
|
||||
// e.g. "0x07 (LC)" → "LC".
|
||||
func parseSFPConnector(val string) string {
|
||||
if s := extractParens(val); s != "" {
|
||||
return s
|
||||
}
|
||||
return val
|
||||
}
|
||||
|
||||
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
|
||||
|
||||
func extractParens(s string) string {
|
||||
m := parenRe.FindStringSubmatch(s)
|
||||
if len(m) < 2 {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(m[1])
|
||||
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPPresent != nil {
|
||||
out["sfp_present"] = *dev.SFPPresent
|
||||
}
|
||||
if dev.SFPIdentifier != nil {
|
||||
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||
}
|
||||
if dev.SFPConnector != nil {
|
||||
out["sfp_connector"] = *dev.SFPConnector
|
||||
}
|
||||
if dev.SFPVendor != nil {
|
||||
out["sfp_vendor"] = *dev.SFPVendor
|
||||
}
|
||||
if dev.SFPPartNumber != nil {
|
||||
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||
}
|
||||
if dev.SFPSerialNumber != nil {
|
||||
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||
}
|
||||
if dev.SFPWavelengthNM != nil {
|
||||
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||
}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
queryPCILSPCIDetail = origDetail
|
||||
readPCIVPDFile = origVPD
|
||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
return "aa:bb:cc:dd:ee:ff", nil
|
||||
}
|
||||
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||
|
||||
@@ -101,6 +104,39 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||
origIfaces := netIfacesByBDF
|
||||
origReadMAC := readNetAddressFile
|
||||
origEth := ethtoolInfoQuery
|
||||
origModule := ethtoolModuleQuery
|
||||
origCarrier := readNetCarrierFile
|
||||
t.Cleanup(func() {
|
||||
netIfacesByBDF = origIfaces
|
||||
readNetAddressFile = origReadMAC
|
||||
ethtoolInfoQuery = origEth
|
||||
ethtoolModuleQuery = origModule
|
||||
readNetCarrierFile = origCarrier
|
||||
})
|
||||
|
||||
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
BDF: &bdf,
|
||||
}}
|
||||
|
||||
out := enrichPCIeWithNICTelemetry(devs)
|
||||
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDBMValue(t *testing.T) {
|
||||
tests := []struct {
|
||||
in string
|
||||
|
||||
@@ -13,14 +13,20 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
Index int
|
||||
BDF string
|
||||
Name string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -68,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Name); v != "" {
|
||||
devs[i].Model = &v
|
||||
}
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
}
|
||||
@@ -94,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 14 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +137,20 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Name: strings.TrimSpace(rec[2]),
|
||||
Serial: strings.TrimSpace(rec[3]),
|
||||
VBIOS: strings.TrimSpace(rec[4]),
|
||||
TemperatureC: parseMaybeFloat(rec[5]),
|
||||
PowerW: parseMaybeFloat(rec[6]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||
HWSlowdown: parseMaybeBool(rec[9]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +182,30 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt(v string) *int {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.Atoi(v)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseRequiredInt(v string) int {
|
||||
n, err := strconv.Atoi(strings.TrimSpace(v))
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func pcieLinkGenLabel(gen int) string {
|
||||
return fmt.Sprintf("Gen%d", gen)
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -216,6 +255,10 @@ func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvidia_gpu_index"] = info.Index
|
||||
if info.TemperatureC != nil {
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
@@ -231,4 +274,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("gpu by normalized bdf not found")
|
||||
}
|
||||
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||
t.Fatalf("name: got %q", gpu.Name)
|
||||
}
|
||||
if gpu.Serial != "GPU-SERIAL-1" {
|
||||
t.Fatalf("serial: got %q", gpu.Serial)
|
||||
}
|
||||
@@ -28,6 +31,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
@@ -80,6 +89,9 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||
}
|
||||
if out[0].Telemetry == nil || out[0].Telemetry["nvidia_gpu_index"] != 0 {
|
||||
t.Fatalf("telemetry nvidia_gpu_index: got %#v", out[0].Telemetry)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||
t.Fatalf("status: got %v", out[0].Status)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
@@ -59,6 +60,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
@@ -78,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||
bmcPatterns := []string{
|
||||
"management system chip",
|
||||
"management controller",
|
||||
"ibmc",
|
||||
"idrac",
|
||||
"ilo vga",
|
||||
"aspeed",
|
||||
"matrox",
|
||||
}
|
||||
for _, bad := range bmcPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
@@ -152,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
@@ -221,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
// pcieLinkSpeedRank returns a numeric rank for a normalized Gen string (e.g. "Gen4" → 4).
|
||||
// Returns 0 for unrecognised values so comparisons fail safe.
|
||||
func pcieLinkSpeedRank(gen string) int {
|
||||
switch gen {
|
||||
case "Gen1":
|
||||
return 1
|
||||
case "Gen2":
|
||||
return 2
|
||||
case "Gen3":
|
||||
return 3
|
||||
case "Gen4":
|
||||
return 4
|
||||
case "Gen5":
|
||||
return 5
|
||||
case "Gen6":
|
||||
return 6
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func normalizePCILinkSpeed(raw string) string {
|
||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||
switch {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -19,6 +20,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
@@ -28,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
@@ -76,6 +80,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
@@ -124,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||
ptr := func(s string) *string { return &s }
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
linkSpeed *string
|
||||
maxSpeed *string
|
||||
wantWarning bool
|
||||
wantGenIn string // substring expected in ErrorDescription when warning
|
||||
}{
|
||||
{
|
||||
name: "degraded Gen1 vs Gen5",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen1",
|
||||
},
|
||||
{
|
||||
name: "at max Gen5",
|
||||
linkSpeed: ptr("Gen5"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "degraded Gen4 vs Gen5",
|
||||
linkSpeed: ptr("Gen4"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen4",
|
||||
},
|
||||
{
|
||||
name: "missing current speed — no warning",
|
||||
linkSpeed: nil,
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "missing max speed — no warning",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: nil,
|
||||
wantWarning: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
ok := statusOK
|
||||
dev.Status = &ok
|
||||
dev.LinkSpeed = tt.linkSpeed
|
||||
dev.MaxLinkSpeed = tt.maxSpeed
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||
if gotWarn != tt.wantWarning {
|
||||
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||
}
|
||||
if tt.wantWarning {
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("expected ErrorDescription to be set")
|
||||
}
|
||||
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||
}
|
||||
} else {
|
||||
if dev.ErrorDescription != nil {
|
||||
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
func isVirtualHDiskModel(model string) bool {
|
||||
model = strings.ToLower(strings.TrimSpace(model))
|
||||
return strings.HasPrefix(model, "virtual hdisk")
|
||||
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
|
||||
2257
audit/internal/platform/benchmark.go
Normal file
2257
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
385
audit/internal/platform/benchmark_report.go
Normal file
385
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,385 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return renderBenchmarkReportWithCharts(result)
|
||||
}
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||
|
||||
// System identity block
|
||||
if result.ServerModel != "" {
|
||||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||
}
|
||||
if result.Hostname != "" {
|
||||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||
}
|
||||
// GPU models summary
|
||||
if len(result.GPUs) > 0 {
|
||||
modelCount := make(map[string]int)
|
||||
var modelOrder []string
|
||||
for _, g := range result.GPUs {
|
||||
m := strings.TrimSpace(g.Name)
|
||||
if m == "" {
|
||||
m = "Unknown GPU"
|
||||
}
|
||||
if modelCount[m] == 0 {
|
||||
modelOrder = append(modelOrder, m)
|
||||
}
|
||||
modelCount[m]++
|
||||
}
|
||||
var parts []string
|
||||
for _, m := range modelOrder {
|
||||
if modelCount[m] == 1 {
|
||||
parts = append(parts, m)
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.RampStep > 0 && result.RampTotal > 0 {
|
||||
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
|
||||
if result.RampRunID != "" {
|
||||
fmt.Fprintf(&b, "**Ramp-up run ID:** %s \n", result.RampRunID)
|
||||
}
|
||||
} else if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
if result.ScalabilityScore > 0 {
|
||||
fmt.Fprintf(&b, "**Scalability score:** %.1f%% \n", result.ScalabilityScore)
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||
if len(result.Findings) > 0 {
|
||||
b.WriteString("## Executive Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
fmt.Fprintf(&b, "- %s\n", finding)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
b.WriteString("## Warnings\n\n")
|
||||
for _, warning := range result.Warnings {
|
||||
fmt.Fprintf(&b, "- %s\n", warning)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
|
||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||||
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||||
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
|
||||
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
|
||||
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
|
||||
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
synthetic,
|
||||
mixed,
|
||||
mixedEff,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||
|
||||
// Identity
|
||||
if gpu.BusID != "" {
|
||||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||
}
|
||||
if gpu.VBIOS != "" {
|
||||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||
}
|
||||
if gpu.ComputeCapability != "" {
|
||||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||
}
|
||||
if gpu.MultiprocessorCount > 0 {
|
||||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||
}
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
if !p.ECC.IsZero() {
|
||||
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
|
||||
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||
eccCorr, eccUncorr)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
|
||||
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
|
||||
}
|
||||
|
||||
// ECC summary
|
||||
if !gpu.ECC.IsZero() {
|
||||
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
|
||||
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
|
||||
}
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
if throttle != "none" {
|
||||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||
}
|
||||
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Degradation / Notes
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
}
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("**Notes:**\n\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(result.Interconnect.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(sp.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Cooling ───────────────────────────────────────────────────────────────
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||||
} else {
|
||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
}
|
||||
for _, note := range cooling.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(cooling.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
|
||||
if result.Interconnect != nil {
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||
type counter struct {
|
||||
label string
|
||||
us uint64
|
||||
}
|
||||
counters := []counter{
|
||||
{"sw_power", t.SWPowerCapUS},
|
||||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||
{"sync_boost", t.SyncBoostUS},
|
||||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||
}
|
||||
var parts []string
|
||||
for _, c := range counters {
|
||||
if c.us == 0 {
|
||||
continue
|
||||
}
|
||||
sec := float64(c.us) / 1e6
|
||||
if steadyDurationSec > 0 {
|
||||
pct := sec / steadyDurationSec * 100
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||
} else if sec < 1 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||
}
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return "none"
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||
var best float64
|
||||
for i, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||
best = gpu.Scores.CompositeScore
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
245
audit/internal/platform/benchmark_test.go
Normal file
245
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,245 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
profile string
|
||||
want benchmarkProfileSpec
|
||||
}{
|
||||
{
|
||||
name: "default",
|
||||
profile: "",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||
},
|
||||
{
|
||||
name: "stability",
|
||||
profile: "stability",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||
},
|
||||
{
|
||||
name: "overnight",
|
||||
profile: "overnight",
|
||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := resolveBenchmarkProfile(tc.profile)
|
||||
if got != tc.want {
|
||||
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||
Profile: "stability",
|
||||
RunNCCL: false,
|
||||
})
|
||||
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||
}
|
||||
if opts.RunNCCL {
|
||||
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := strings.Join([]string{
|
||||
"loader=bee-gpu-burn",
|
||||
"[gpu 0] device=NVIDIA H100",
|
||||
"[gpu 0] compute_capability=9.0",
|
||||
"[gpu 0] backend=cublasLt",
|
||||
"[gpu 0] duration_s=10",
|
||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||
"[gpu 0] fp16_tensor_iterations=200",
|
||||
"[gpu 0] fp8_e4m3_iterations=50",
|
||||
"[gpu 0] status=OK",
|
||||
}, "\n")
|
||||
|
||||
got := parseBenchmarkBurnLog(raw)
|
||||
if got.Backend != "cublasLt" {
|
||||
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||
}
|
||||
if got.ComputeCapability != "9.0" {
|
||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||
}
|
||||
if len(got.Profiles) != 2 {
|
||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||
}
|
||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||
}
|
||||
if got.Profiles[1].Category != "fp8" {
|
||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
result := NvidiaBenchmarkResult{
|
||||
BenchmarkVersion: benchmarkVersion,
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "PARTIAL",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "partial",
|
||||
},
|
||||
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||
GPUs: []BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100",
|
||||
Status: "OK",
|
||||
Steady: BenchmarkTelemetrySummary{
|
||||
AvgPowerW: 680,
|
||||
AvgTempC: 79,
|
||||
AvgGraphicsClockMHz: 1725,
|
||||
P95PowerW: 700,
|
||||
P95TempC: 82,
|
||||
P95GraphicsClockMHz: 1800,
|
||||
},
|
||||
Scores: BenchmarkScorecard{
|
||||
ComputeScore: 1200,
|
||||
PowerSustainScore: 96,
|
||||
ThermalSustainScore: 88,
|
||||
StabilityScore: 92,
|
||||
CompositeScore: 1176,
|
||||
},
|
||||
PrecisionResults: []BenchmarkPrecisionResult{
|
||||
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||
},
|
||||
Throttle: BenchmarkThrottleCounters{
|
||||
SWPowerCapUS: 1000000,
|
||||
},
|
||||
DegradationReasons: []string{"power_capped"},
|
||||
},
|
||||
},
|
||||
Cooling: &BenchmarkCoolingSummary{
|
||||
Available: true,
|
||||
AvgFanRPM: 9200,
|
||||
FanDutyCycleAvailable: true,
|
||||
AvgFanDutyCyclePct: 47.5,
|
||||
P95FanDutyCyclePct: 62.0,
|
||||
},
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
for _, needle := range []string{
|
||||
"Executive Summary",
|
||||
"GPU 0 spent measurable time under SW power cap.",
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
"Cooling",
|
||||
"Average fan duty cycle",
|
||||
"47.5%",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
report := renderBenchmarkReport(NvidiaBenchmarkResult{
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "OK",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"gpu-metrics.csv",
|
||||
"gpu-metrics.html",
|
||||
"gpu-burn.log",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Clocks
|
||||
Graphics : 2422 MHz
|
||||
Memory : 12481 MHz
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
SM : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
Video : 2107 MHz
|
||||
|
||||
GPU 00000000:4F:00.0
|
||||
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||
Max Clocks
|
||||
Graphics : 2430 MHz
|
||||
Memory : 12481 MHz
|
||||
`)
|
||||
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||
}
|
||||
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvsmiQ := []byte(`
|
||||
GPU 00000000:4E:00.0
|
||||
Max Clocks
|
||||
Graphics : 9999 MHz
|
||||
Memory : 9999 MHz
|
||||
`)
|
||||
// Already populated — must not be overwritten.
|
||||
infoByIndex := map[int]benchmarkGPUInfo{
|
||||
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||
}
|
||||
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||
|
||||
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||
}
|
||||
}
|
||||
247
audit/internal/platform/benchmark_types.go
Normal file
247
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,247 @@
|
||||
package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig holds static CPU and memory configuration captured at
|
||||
// benchmark start. Useful for correlating results across runs on different hardware.
|
||||
type BenchmarkHostConfig struct {
|
||||
CPUModel string `json:"cpu_model,omitempty"`
|
||||
CPUSockets int `json:"cpu_sockets,omitempty"`
|
||||
CPUCores int `json:"cpu_cores,omitempty"`
|
||||
CPUThreads int `json:"cpu_threads,omitempty"`
|
||||
MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
|
||||
// steady-state phase. High or unstable CPU load during a GPU benchmark may
|
||||
// indicate a competing workload or a CPU-bound driver bottleneck.
|
||||
type BenchmarkCPULoad struct {
|
||||
AvgPct float64 `json:"avg_pct"`
|
||||
MaxPct float64 `json:"max_pct"`
|
||||
P95Pct float64 `json:"p95_pct"`
|
||||
Samples int `json:"samples"`
|
||||
// Status is "ok", "high", or "unstable".
|
||||
Status string `json:"status"`
|
||||
Note string `json:"note,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
|
||||
// benchmark run.
|
||||
type BenchmarkCoolingSummary struct {
|
||||
Available bool `json:"available"`
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
const (
|
||||
NvidiaBenchmarkProfileStandard = "standard"
|
||||
NvidiaBenchmarkProfileStability = "stability"
|
||||
NvidiaBenchmarkProfileOvernight = "overnight"
|
||||
)
|
||||
|
||||
type NvidiaBenchmarkOptions struct {
|
||||
Profile string
|
||||
SizeMB int
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
RunNCCL bool
|
||||
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||
RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up)
|
||||
RampTotal int // total number of ramp-up steps in this run
|
||||
RampRunID string // shared identifier across all steps of the same ramp-up run
|
||||
}
|
||||
|
||||
type NvidiaBenchmarkResult struct {
|
||||
BenchmarkVersion string `json:"benchmark_version"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Hostname string `json:"hostname,omitempty"`
|
||||
ServerModel string `json:"server_model,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
RampStep int `json:"ramp_step,omitempty"`
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
ScalabilityScore float64 `json:"scalability_score,omitempty"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
Status string `json:"status"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalizationGPU struct {
|
||||
Index int `json:"index"`
|
||||
PersistenceMode string `json:"persistence_mode,omitempty"`
|
||||
GPUClockLockMHz float64 `json:"gpu_clock_lock_mhz,omitempty"`
|
||||
GPUClockLockStatus string `json:"gpu_clock_lock_status,omitempty"`
|
||||
MemoryClockLockMHz float64 `json:"memory_clock_lock_mhz,omitempty"`
|
||||
MemoryClockLockStatus string `json:"memory_clock_lock_status,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkGPUResult struct {
|
||||
Index int `json:"index"`
|
||||
UUID string `json:"uuid,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
BusID string `json:"bus_id,omitempty"`
|
||||
VBIOS string `json:"vbios,omitempty"`
|
||||
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
// CalibratedPeakPowerW is the p95 power measured during a short
|
||||
// dcgmi targeted_power calibration run before the main benchmark.
|
||||
// Used as the reference denominator for PowerSustainScore instead of
|
||||
// the hardware default limit, which bee-gpu-burn cannot reach.
|
||||
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
|
||||
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||
// ECC error delta accumulated over the full benchmark (all phases combined).
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||
Scores BenchmarkScorecard `json:"scores"`
|
||||
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkTelemetrySummary struct {
|
||||
DurationSec float64 `json:"duration_sec"`
|
||||
Samples int `json:"samples"`
|
||||
AvgTempC float64 `json:"avg_temp_c"`
|
||||
P95TempC float64 `json:"p95_temp_c"`
|
||||
AvgPowerW float64 `json:"avg_power_w"`
|
||||
P95PowerW float64 `json:"p95_power_w"`
|
||||
AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
|
||||
P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
|
||||
AvgMemoryClockMHz float64 `json:"avg_memory_clock_mhz"`
|
||||
P95MemoryClockMHz float64 `json:"p95_memory_clock_mhz"`
|
||||
AvgUsagePct float64 `json:"avg_usage_pct"`
|
||||
AvgMemUsagePct float64 `json:"avg_mem_usage_pct"`
|
||||
ClockCVPct float64 `json:"clock_cv_pct"`
|
||||
PowerCVPct float64 `json:"power_cv_pct"`
|
||||
TempCVPct float64 `json:"temp_cv_pct"`
|
||||
ClockDriftPct float64 `json:"clock_drift_pct"`
|
||||
}
|
||||
|
||||
type BenchmarkThrottleCounters struct {
|
||||
SWPowerCapUS uint64 `json:"sw_power_cap_us"`
|
||||
SWThermalSlowdownUS uint64 `json:"sw_thermal_slowdown_us"`
|
||||
SyncBoostUS uint64 `json:"sync_boost_us"`
|
||||
HWThermalSlowdownUS uint64 `json:"hw_thermal_slowdown_us"`
|
||||
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
|
||||
}
|
||||
|
||||
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
|
||||
// Corrected = single-bit errors fixed by ECC (DRAM degradation).
|
||||
// Uncorrected = double-bit errors that could not be corrected (serious fault).
|
||||
// Both are volatile (since last driver reset), not persistent.
|
||||
type BenchmarkECCCounters struct {
|
||||
Corrected uint64 `json:"corrected"`
|
||||
Uncorrected uint64 `json:"uncorrected"`
|
||||
}
|
||||
|
||||
func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
|
||||
func (e BenchmarkECCCounters) IsZero() bool { return e.Corrected == 0 && e.Uncorrected == 0 }
|
||||
|
||||
type BenchmarkPrecisionResult struct {
|
||||
Name string `json:"name"`
|
||||
Category string `json:"category"`
|
||||
Supported bool `json:"supported"`
|
||||
Lanes int `json:"lanes,omitempty"`
|
||||
M uint64 `json:"m,omitempty"`
|
||||
N uint64 `json:"n,omitempty"`
|
||||
K uint64 `json:"k,omitempty"`
|
||||
Iterations uint64 `json:"iterations,omitempty"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
// Weight is the fp32-equivalence factor for this precision category.
|
||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
|
||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||
Weight float64 `json:"weight,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
Notes string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkScorecard struct {
|
||||
ComputeScore float64 `json:"compute_score"`
|
||||
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
|
||||
// steady phases (each precision ran alone, full GPU dedicated).
|
||||
SyntheticScore float64 `json:"synthetic_score,omitempty"`
|
||||
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
|
||||
// (all precisions competing simultaneously — closer to real workloads).
|
||||
MixedScore float64 `json:"mixed_score,omitempty"`
|
||||
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
|
||||
// sustains throughput under concurrent mixed-precision load.
|
||||
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
|
||||
PowerSustainScore float64 `json:"power_sustain_score"`
|
||||
ThermalSustainScore float64 `json:"thermal_sustain_score"`
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
||||
// over-reporting its power consumption.
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"`
|
||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||
DeltaW float64 `json:"delta_w,omitempty"`
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
|
||||
// during a dedicated single-precision steady window. Because only one kernel
|
||||
// type runs at a time the PowerCVPct here is a genuine stability signal.
|
||||
type BenchmarkPrecisionSteadyPhase struct {
|
||||
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
|
||||
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||
// ECC errors accumulated during this precision phase only.
|
||||
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
|
||||
// Any uncorrected = serious fault triggered by this precision workload.
|
||||
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
Status string `json:"status"`
|
||||
Attempted bool `json:"attempted"`
|
||||
Supported bool `json:"supported"`
|
||||
SelectedGPUIndices []int `json:"selected_gpu_indices,omitempty"`
|
||||
AvgAlgBWGBps float64 `json:"avg_algbw_gbps,omitempty"`
|
||||
MaxAlgBWGBps float64 `json:"max_algbw_gbps,omitempty"`
|
||||
AvgBusBWGBps float64 `json:"avg_busbw_gbps,omitempty"`
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
func mustPat(s string) *regexp.Regexp {
|
||||
return regexp.MustCompile(s)
|
||||
}
|
||||
@@ -13,19 +13,24 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
Stage string `json:"stage,omitempty"`
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
|
||||
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
|
||||
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
@@ -46,7 +51,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 6 {
|
||||
if len(parts) < 7 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
@@ -57,6 +62,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
MemClockMHz: parseGPUFloat(parts[6]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
@@ -139,14 +145,24 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||
var b bytes.Buffer
|
||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
||||
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
|
||||
for _, r := range rows {
|
||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
||||
dutyAvail := 0
|
||||
if r.FanDutyCycleAvailable {
|
||||
dutyAvail = 1
|
||||
}
|
||||
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
|
||||
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
|
||||
}
|
||||
return os.WriteFile(path, b.Bytes(), 0644)
|
||||
}
|
||||
|
||||
type gpuMetricStageSpan struct {
|
||||
Name string
|
||||
Start float64
|
||||
End float64
|
||||
}
|
||||
|
||||
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
|
||||
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
// Group by GPU index preserving order.
|
||||
@@ -161,9 +177,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
stageSpans := buildGPUMetricStageSpans(rows)
|
||||
stageColorByName := make(map[string]string, len(stageSpans))
|
||||
for i, span := range stageSpans {
|
||||
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
|
||||
}
|
||||
|
||||
var legend strings.Builder
|
||||
if len(stageSpans) > 0 {
|
||||
legend.WriteString(`<div class="stage-legend">`)
|
||||
for _, span := range stageSpans {
|
||||
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
|
||||
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
|
||||
}
|
||||
legend.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
var svgs strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
|
||||
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
|
||||
svgs.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -173,21 +205,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
|
||||
<meta charset="utf-8">
|
||||
<title>GPU Stress Test Metrics</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
|
||||
h1 { text-align: center; color: #333; margin: 0 0 8px; }
|
||||
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
|
||||
*{box-sizing:border-box}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
|
||||
.page{padding:24px}
|
||||
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
|
||||
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
|
||||
.card-body{padding:16px}
|
||||
h1{font-size:22px;margin:0 0 6px}
|
||||
p{color:var(--muted);font-size:13px;margin:0 0 16px}
|
||||
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
|
||||
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
|
||||
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
|
||||
.chart-block{margin-top:16px}
|
||||
</style>
|
||||
</head><body>
|
||||
<div class="page">
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress Test Metrics</div>
|
||||
<div class="card-body">
|
||||
<h1>GPU Stress Test Metrics</h1>
|
||||
<p>Generated %s</p>
|
||||
%s
|
||||
</body></html>`, ts, svgs.String())
|
||||
<div class="chart-block">%s</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>`, ts, legend.String(), svgs.String())
|
||||
|
||||
return os.WriteFile(path, []byte(html), 0644)
|
||||
}
|
||||
|
||||
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
|
||||
// Layout
|
||||
const W, H = 960, 520
|
||||
const plotX1 = 120 // usage axis / chart left border
|
||||
@@ -197,7 +247,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
const PW = plotX2 - plotX1
|
||||
const PH = plotY2 - plotY1
|
||||
// Outer axes
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const tempAxisX = 60 // temp axis line
|
||||
const clockAxisX = 900 // clock axis line
|
||||
|
||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||
@@ -282,6 +332,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
}
|
||||
b.WriteString("</g>\n")
|
||||
|
||||
// Stage backgrounds
|
||||
for _, span := range stageSpans {
|
||||
x1 := xv(span.Start)
|
||||
x2 := xv(span.End)
|
||||
if x2 < x1 {
|
||||
x1, x2 = x2, x1
|
||||
}
|
||||
if x2-x1 < 1 {
|
||||
x2 = x1 + 1
|
||||
}
|
||||
color := stageColorByName[span.Name]
|
||||
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
|
||||
x1, plotY1, x2-x1, PH, color)
|
||||
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
|
||||
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
|
||||
}
|
||||
|
||||
// Chart border
|
||||
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
|
||||
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
|
||||
@@ -380,224 +447,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
const (
|
||||
termChartWidth = 70
|
||||
termChartHeight = 12
|
||||
)
|
||||
|
||||
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
|
||||
// Used in SAT stress-test logs.
|
||||
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
seen := make(map[int]bool)
|
||||
var order []int
|
||||
gpuMap := make(map[int][]GPUMetricRow)
|
||||
for _, r := range rows {
|
||||
if !seen[r.GPUIndex] {
|
||||
seen[r.GPUIndex] = true
|
||||
order = append(order, r.GPUIndex)
|
||||
}
|
||||
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
|
||||
}
|
||||
|
||||
type seriesDef struct {
|
||||
caption string
|
||||
color string
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for _, gpuIdx := range order {
|
||||
gr := gpuMap[gpuIdx]
|
||||
if len(gr) == 0 {
|
||||
continue
|
||||
}
|
||||
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
|
||||
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
|
||||
for _, d := range defs {
|
||||
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
|
||||
termChartHeight, termChartWidth))
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
}
|
||||
|
||||
return strings.TrimRight(b.String(), "\n")
|
||||
}
|
||||
|
||||
// renderLineChart draws a single time-series line chart using box-drawing characters.
|
||||
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
|
||||
func renderLineChart(vals []float64, color, caption string, height, width int) string {
|
||||
if len(vals) == 0 {
|
||||
return caption + "\n"
|
||||
}
|
||||
|
||||
mn, mx := gpuMinMax(vals)
|
||||
if mn == mx {
|
||||
mx = mn + 1
|
||||
}
|
||||
|
||||
// Use the smaller of width or len(vals) to avoid stretching sparse data.
|
||||
w := width
|
||||
if len(vals) < w {
|
||||
w = len(vals)
|
||||
}
|
||||
data := gpuDownsample(vals, w)
|
||||
|
||||
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
|
||||
row := make([]int, w)
|
||||
for i, v := range data {
|
||||
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
|
||||
if r < 0 {
|
||||
r = 0
|
||||
}
|
||||
if r > height {
|
||||
r = height
|
||||
}
|
||||
row[i] = r
|
||||
}
|
||||
|
||||
// Fill the character grid.
|
||||
grid := make([][]rune, height+1)
|
||||
for i := range grid {
|
||||
grid[i] = make([]rune, w)
|
||||
for j := range grid[i] {
|
||||
grid[i][j] = ' '
|
||||
}
|
||||
}
|
||||
for x := 0; x < w; x++ {
|
||||
r := row[x]
|
||||
if x == 0 {
|
||||
grid[r][0] = '─'
|
||||
continue
|
||||
}
|
||||
p := row[x-1]
|
||||
switch {
|
||||
case r == p:
|
||||
grid[r][x] = '─'
|
||||
case r < p: // value went up (row index decreased toward top)
|
||||
grid[r][x] = '╭'
|
||||
grid[p][x] = '╯'
|
||||
for y := r + 1; y < p; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
default: // r > p, value went down
|
||||
grid[p][x] = '╮'
|
||||
grid[r][x] = '╰'
|
||||
for y := p + 1; y < r; y++ {
|
||||
grid[y][x] = '│'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Y axis tick labels.
|
||||
ticks := gpuNiceTicks(mn, mx, height/2)
|
||||
tickAtRow := make(map[int]string)
|
||||
labelWidth := 4
|
||||
for _, t := range ticks {
|
||||
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
|
||||
if r < 0 || r > height {
|
||||
continue
|
||||
}
|
||||
s := gpuFormatTick(t)
|
||||
tickAtRow[r] = s
|
||||
if len(s) > labelWidth {
|
||||
labelWidth = len(s)
|
||||
}
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
for r := 0; r <= height; r++ {
|
||||
label := tickAtRow[r]
|
||||
fmt.Fprintf(&b, "%*s", labelWidth, label)
|
||||
switch {
|
||||
case label != "":
|
||||
b.WriteRune('┤')
|
||||
case r == height:
|
||||
b.WriteRune('┼')
|
||||
default:
|
||||
b.WriteRune('│')
|
||||
}
|
||||
b.WriteString(color)
|
||||
b.WriteString(string(grid[r]))
|
||||
b.WriteString(ansiReset)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
// Bottom axis.
|
||||
b.WriteString(strings.Repeat(" ", labelWidth))
|
||||
b.WriteRune('└')
|
||||
b.WriteString(strings.Repeat("─", w))
|
||||
b.WriteRune('\n')
|
||||
|
||||
// Caption centered under the chart.
|
||||
if caption != "" {
|
||||
total := labelWidth + 1 + w
|
||||
if pad := (total - len(caption)) / 2; pad > 0 {
|
||||
b.WriteString(strings.Repeat(" ", pad))
|
||||
}
|
||||
b.WriteString(caption)
|
||||
b.WriteRune('\n')
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
|
||||
v := make([]float64, len(rows))
|
||||
for i, r := range rows {
|
||||
v[i] = fn(r)
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
|
||||
func gpuDownsample(vals []float64, w int) []float64 {
|
||||
n := len(vals)
|
||||
if n == 0 {
|
||||
return make([]float64, w)
|
||||
}
|
||||
result := make([]float64, w)
|
||||
if n >= w {
|
||||
counts := make([]int, w)
|
||||
for i, v := range vals {
|
||||
bucket := i * w / n
|
||||
if bucket >= w {
|
||||
bucket = w - 1
|
||||
}
|
||||
result[bucket] += v
|
||||
counts[bucket]++
|
||||
}
|
||||
for i := range result {
|
||||
if counts[i] > 0 {
|
||||
result[i] /= float64(counts[i])
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Nearest-neighbour upsample.
|
||||
for i := range result {
|
||||
src := i * (n - 1) / (w - 1)
|
||||
if src >= n {
|
||||
src = n - 1
|
||||
}
|
||||
result[i] = vals[src]
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func gpuMinMax(vals []float64) (float64, float64) {
|
||||
if len(vals) == 0 {
|
||||
return 0, 1
|
||||
@@ -642,3 +491,46 @@ func gpuFormatTick(v float64) string {
|
||||
}
|
||||
return strconv.FormatFloat(v, 'f', 1, 64)
|
||||
}
|
||||
|
||||
var gpuMetricStagePalette = []string{
|
||||
"#d95c5c",
|
||||
"#2185d0",
|
||||
"#21ba45",
|
||||
"#f2c037",
|
||||
"#6435c9",
|
||||
"#00b5ad",
|
||||
"#a5673f",
|
||||
}
|
||||
|
||||
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
|
||||
var spans []gpuMetricStageSpan
|
||||
for _, row := range rows {
|
||||
name := strings.TrimSpace(row.Stage)
|
||||
if name == "" {
|
||||
name = "run"
|
||||
}
|
||||
if len(spans) == 0 || spans[len(spans)-1].Name != name {
|
||||
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
|
||||
continue
|
||||
}
|
||||
spans[len(spans)-1].End = row.ElapsedSec
|
||||
}
|
||||
for i := range spans {
|
||||
if spans[i].End <= spans[i].Start {
|
||||
spans[i].End = spans[i].Start + 1
|
||||
}
|
||||
}
|
||||
return spans
|
||||
}
|
||||
|
||||
var gpuHTMLReplacer = strings.NewReplacer(
|
||||
"&", "&",
|
||||
"<", "<",
|
||||
">", ">",
|
||||
`"`, """,
|
||||
"'", "'",
|
||||
)
|
||||
|
||||
func gpuHTMLEscape(s string) string {
|
||||
return gpuHTMLReplacer.Replace(s)
|
||||
}
|
||||
|
||||
65
audit/internal/platform/gpu_metrics_test.go
Normal file
65
audit/internal/platform/gpu_metrics_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.csv")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
|
||||
}
|
||||
if err := WriteGPUMetricsCSV(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsCSV: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage,elapsed_sec,gpu_index",
|
||||
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("csv missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "gpu-metrics.html")
|
||||
rows := []GPUMetricRow{
|
||||
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
|
||||
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
|
||||
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
|
||||
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
|
||||
}
|
||||
if err := WriteGPUMetricsHTML(path, rows); err != nil {
|
||||
t.Fatalf("WriteGPUMetricsHTML: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
for _, needle := range []string{
|
||||
"stage-legend",
|
||||
"baseline",
|
||||
"steady-fp16",
|
||||
"GPU Stress Test Metrics",
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("html missing %q\n%s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
func mountSource(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "SOURCE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func mountFSType(target string) string {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", target).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceType(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TYPE", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func blockDeviceTransport(device string) string {
|
||||
if strings.TrimSpace(device) == "" {
|
||||
return ""
|
||||
}
|
||||
out, err := exec.Command("lsblk", "-dn", "-o", "TRAN", device).Output()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(string(out))
|
||||
}
|
||||
|
||||
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
switch {
|
||||
case strings.EqualFold(strings.TrimSpace(fsType), "tmpfs"):
|
||||
return "ram"
|
||||
case strings.EqualFold(strings.TrimSpace(deviceType), "rom"):
|
||||
return "cdrom"
|
||||
case strings.EqualFold(strings.TrimSpace(transport), "usb"):
|
||||
return "usb"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/sr"):
|
||||
return "cdrom"
|
||||
case strings.HasPrefix(strings.TrimSpace(source), "/dev/"):
|
||||
return "disk"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
|
||||
@@ -12,11 +12,48 @@ import (
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
// No medium mount at all — fall back to toram kernel parameter.
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
if strings.EqualFold(fsType, "tmpfs") {
|
||||
return true
|
||||
}
|
||||
// When RunInstallToRAM copies squashfs to /dev/shm/bee-live but the bind
|
||||
// mount of /run/live/medium fails (common for CD-ROM boots), the medium
|
||||
// fstype still shows the CD-ROM type. Check whether the RAM copy exists.
|
||||
files, _ := filepath.Glob("/dev/shm/bee-live/*.squashfs")
|
||||
return len(files) > 0
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
@@ -87,14 +124,71 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := exec.Command("mount", "--bind", dstDir, "/run/live/medium").Run(); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||
|
||||
mediumRebound := false
|
||||
if err := bindMount(dstDir, "/run/live/medium"); err != nil {
|
||||
log(fmt.Sprintf("Warning: rebind /run/live/medium → %s failed: %v", dstDir, err))
|
||||
} else {
|
||||
mediumRebound = true
|
||||
}
|
||||
|
||||
log("Done. Installation media can be safely disconnected.")
|
||||
log("Verifying live medium now served from RAM...")
|
||||
status := s.LiveBootSource()
|
||||
if err := verifyInstallToRAMStatus(status, dstDir, mediumRebound, log); err != nil {
|
||||
return err
|
||||
}
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
}
|
||||
|
||||
// The live medium mount was not redirected to RAM. This is expected when
|
||||
// booting from an ISO/CD-ROM: the squashfs loop device has a non-zero
|
||||
// offset and LOOP_CHANGE_FD cannot be used; the bind mount also fails
|
||||
// because the CD-ROM mount is in use. Check whether files were at least
|
||||
// copied to the tmpfs directory — that is sufficient for safe disconnection
|
||||
// once the kernel has paged in all actively-used data.
|
||||
files, _ := filepath.Glob(filepath.Join(dstDir, "*.squashfs"))
|
||||
if len(files) > 0 {
|
||||
if !mediumRebound {
|
||||
log(fmt.Sprintf("Note: squashfs copied to RAM (%s) but /run/live/medium still shows the original source.", dstDir))
|
||||
log("This is normal for CD-ROM boots. For a fully transparent RAM boot, add 'toram' to the kernel parameters.")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s and no squashfs found in %s", describeLiveBootSource(status), dstDir)
|
||||
}
|
||||
|
||||
func describeLiveBootSource(status LiveBootSource) string {
|
||||
source := strings.TrimSpace(status.Device)
|
||||
if source == "" {
|
||||
source = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if source == "" {
|
||||
source = "unknown source"
|
||||
}
|
||||
switch strings.TrimSpace(status.Kind) {
|
||||
case "ram":
|
||||
return "RAM"
|
||||
case "usb":
|
||||
return "USB (" + source + ")"
|
||||
case "cdrom":
|
||||
return "CD-ROM (" + source + ")"
|
||||
case "disk":
|
||||
return "disk (" + source + ")"
|
||||
default:
|
||||
return source
|
||||
}
|
||||
}
|
||||
|
||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
@@ -183,7 +277,31 @@ func findLoopForFile(backingFile string) (string, error) {
|
||||
return "", fmt.Errorf("no loop device found for %s", backingFile)
|
||||
}
|
||||
|
||||
// loopDeviceOffset returns the byte offset configured for the loop device,
|
||||
// or -1 if it cannot be determined.
|
||||
func loopDeviceOffset(loopDev string) int64 {
|
||||
out, err := exec.Command("losetup", "--json", loopDev).Output()
|
||||
if err != nil {
|
||||
return -1
|
||||
}
|
||||
var result struct {
|
||||
Loopdevices []struct {
|
||||
Offset int64 `json:"offset"`
|
||||
} `json:"loopdevices"`
|
||||
}
|
||||
if err := json.Unmarshal(out, &result); err != nil || len(result.Loopdevices) == 0 {
|
||||
return -1
|
||||
}
|
||||
return result.Loopdevices[0].Offset
|
||||
}
|
||||
|
||||
func reassociateLoopDevice(loopDev, newFile string) error {
|
||||
// LOOP_CHANGE_FD requires lo_offset == 0. ISO/CD-ROM loop devices are
|
||||
// typically set up with a non-zero offset (squashfs lives inside the ISO),
|
||||
// so the ioctl returns EINVAL. Detect this early for a clear error message.
|
||||
if off := loopDeviceOffset(loopDev); off > 0 {
|
||||
return fmt.Errorf("loop device has non-zero offset (%d bytes, typical for ISO/CD-ROM) — LOOP_CHANGE_FD not supported; use 'toram' kernel parameter for RAM boot", off)
|
||||
}
|
||||
if err := exec.Command("losetup", "--replace", loopDev, newFile).Run(); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -26,3 +26,8 @@ func loopChangeFD(loopDev, newFile string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// bindMount binds src over dst using the syscall directly (avoids exec PATH issues).
|
||||
func bindMount(src, dst string) error {
|
||||
return syscall.Mount(src, dst, "", syscall.MS_BIND, "")
|
||||
}
|
||||
|
||||
@@ -7,3 +7,7 @@ import "errors"
|
||||
func loopChangeFD(loopDev, newFile string) error {
|
||||
return errors.New("LOOP_CHANGE_FD not available on this platform")
|
||||
}
|
||||
|
||||
func bindMount(src, dst string) error {
|
||||
return errors.New("bind mount not available on this platform")
|
||||
}
|
||||
|
||||
60
audit/internal/platform/install_to_ram_test.go
Normal file
60
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dstDir := t.TempDir()
|
||||
|
||||
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}, dstDir, false, nil); err != nil {
|
||||
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||
}
|
||||
|
||||
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"}, dstDir, false, nil)
|
||||
if err == nil {
|
||||
t.Fatal("expected verification failure when media is still on USB")
|
||||
}
|
||||
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1) and no squashfs found in "+dstDir {
|
||||
t.Fatalf("error=%q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribeLiveBootSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||
t.Fatalf("got %q want RAM", got)
|
||||
}
|
||||
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||
t.Fatalf("got %q want /run/live/medium", got)
|
||||
}
|
||||
}
|
||||
68
audit/internal/platform/kill_workers.go
Normal file
68
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,68 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
// bee test worker processes that should be killed by KillTestWorkers.
|
||||
var workerPatterns = []string{
|
||||
"bee-gpu-burn",
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
"dcgmi",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
type KilledProcess struct {
|
||||
PID int `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(e.Name())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||
exe := strings.TrimSpace(args[0])
|
||||
base := exe
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
@@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
var cpuStatPrev [2]uint64 // [total, idle]
|
||||
|
||||
func sampleCPULoadPct() float64 {
|
||||
total, idle := readCPUStat()
|
||||
if total == 0 {
|
||||
total0, idle0 := readCPUStat()
|
||||
if total0 == 0 {
|
||||
return 0
|
||||
}
|
||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
||||
cpuStatPrev = [2]uint64{total, idle}
|
||||
if prevTotal == 0 {
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
total1, idle1 := readCPUStat()
|
||||
if total1 == 0 {
|
||||
return 0
|
||||
}
|
||||
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||
}
|
||||
|
||||
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
|
||||
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPULoadPctBetween(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
prevTotal uint64
|
||||
prevIdle uint64
|
||||
total uint64
|
||||
idle uint64
|
||||
want float64
|
||||
}{
|
||||
{
|
||||
name: "busy half",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 90,
|
||||
want: 50,
|
||||
},
|
||||
{
|
||||
name: "fully busy",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 40,
|
||||
want: 100,
|
||||
},
|
||||
{
|
||||
name: "no progress",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 100,
|
||||
idle: 40,
|
||||
want: 0,
|
||||
},
|
||||
{
|
||||
name: "idle delta larger than total clamps to zero",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 150,
|
||||
want: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -95,9 +101,7 @@ func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
|
||||
@@ -26,7 +26,8 @@ type PlatformStressCycle struct {
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
Cycles []PlatformStressCycle
|
||||
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
@@ -68,8 +69,11 @@ func (s *System) RunPlatformStress(
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
@@ -88,27 +92,31 @@ func (s *System) RunPlatformStress(
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
if hasCPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
}
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
if hasGPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
@@ -153,13 +161,7 @@ func (s *System) RunPlatformStress(
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
@@ -384,6 +386,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
@@ -394,28 +403,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
return buildAMDGPUStressCmd(ctx, durSec)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
cfg := fmt.Sprintf(`actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
duration: %d`, durSec*1000) + `
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
@@ -425,13 +434,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
@@ -439,7 +455,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||
// where the context is cancelled early (user stop, parent timeout).
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
@@ -486,6 +512,15 @@ func platformStressMemoryMB() int {
|
||||
return mb
|
||||
}
|
||||
|
||||
func containsComponent(components []string, name string) bool {
|
||||
for _, c := range components {
|
||||
if c == name {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
s.collectToRAMHealth(&health)
|
||||
s.collectUSBExportHealth(&health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -135,12 +138,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
case "nvidia":
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"dcgmi",
|
||||
"nv-hostengine",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||
@@ -155,11 +161,127 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
return tools
|
||||
}
|
||||
|
||||
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
for _, candidate := range candidates {
|
||||
path, err := exec.LookPath(candidate)
|
||||
if err == nil {
|
||||
return ToolStatus{Name: display, Path: path, OK: true}
|
||||
}
|
||||
}
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
|
||||
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
|
||||
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
inRAM := s.IsLiveMediaInRAM()
|
||||
active := toramActive()
|
||||
switch {
|
||||
case inRAM:
|
||||
health.ToRAMStatus = "ok"
|
||||
case active:
|
||||
// toram was requested but medium is not yet/no longer in RAM
|
||||
health.ToRAMStatus = "failed"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
|
||||
})
|
||||
default:
|
||||
health.ToRAMStatus = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||
// suitable for log export. Sets USBExportPath to the first match found.
|
||||
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||
health.USBExportPath = findUSBExportMount()
|
||||
}
|
||||
|
||||
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||
// has USB transport. Returns "" if none found.
|
||||
func findUSBExportMount() string {
|
||||
f, err := os.Open("/proc/mounts")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
// fields: device mountpoint fstype options dump pass
|
||||
fields := strings.Fields(scanner.Text())
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||
continue
|
||||
}
|
||||
// Skip read-only mounts
|
||||
opts := strings.Split(options, ",")
|
||||
readOnly := false
|
||||
for _, o := range opts {
|
||||
if strings.TrimSpace(o) == "ro" {
|
||||
readOnly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if readOnly {
|
||||
continue
|
||||
}
|
||||
// Check USB transport via lsblk on the device (or its parent disk for partitions).
|
||||
if !strings.HasPrefix(device, "/dev/") {
|
||||
continue
|
||||
}
|
||||
checkDev := device
|
||||
// lsblk only reports TRAN for the whole disk, not for partitions (e.g. /dev/sdc1).
|
||||
// Strip trailing partition digits to get the parent disk name.
|
||||
if trimmed := strings.TrimRight(device, "0123456789"); trimmed != device && len(trimmed) > len("/dev/") {
|
||||
checkDev = trimmed
|
||||
}
|
||||
if blockDeviceTransport(checkDev) == "usb" {
|
||||
return mountPoint
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_stuck",
|
||||
Severity: "critical",
|
||||
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||
})
|
||||
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_disabled",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||
})
|
||||
}
|
||||
}
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
|
||||
@@ -16,14 +16,16 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satFreeMemBytes = freeMemBytes
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
@@ -37,6 +39,12 @@ var (
|
||||
"/opt/rocm/bin/rvs",
|
||||
"/opt/rocm-*/bin/rvs",
|
||||
}
|
||||
dcgmProfTesterCandidates = []string{
|
||||
"dcgmproftester",
|
||||
"dcgmproftester13",
|
||||
"dcgmproftester12",
|
||||
"dcgmproftester11",
|
||||
}
|
||||
)
|
||||
|
||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||
@@ -75,15 +83,46 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
||||
|
||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||
type NvidiaGPU struct {
|
||||
Index int
|
||||
Name string
|
||||
MemoryMB int
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
MemoryMB int `json:"memory_mb"`
|
||||
}
|
||||
|
||||
type NvidiaGPUStatus struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
BDF string `json:"bdf,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Status string `json:"status"`
|
||||
RawLine string `json:"raw_line,omitempty"`
|
||||
NeedsReset bool `json:"needs_reset"`
|
||||
ParseFailure bool `json:"parse_failure,omitempty"`
|
||||
}
|
||||
|
||||
type nvidiaGPUHealth struct {
|
||||
Index int
|
||||
Name string
|
||||
NeedsReset bool
|
||||
RawLine string
|
||||
ParseFailure bool
|
||||
}
|
||||
|
||||
type nvidiaGPUStatusFile struct {
|
||||
Index int
|
||||
Name string
|
||||
RunStatus string
|
||||
Reason string
|
||||
Health string
|
||||
HealthRaw string
|
||||
Observed bool
|
||||
Selected bool
|
||||
FailingJob string
|
||||
}
|
||||
|
||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||
type AMDGPUInfo struct {
|
||||
Index int
|
||||
Name string
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||
@@ -255,9 +294,78 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
MemoryMB: memMB,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool {
|
||||
return gpus[i].Index < gpus[j].Index
|
||||
})
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func (s *System) ListNvidiaGPUStatuses() ([]NvidiaGPUStatus, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,pci.bus_id,serial,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var gpus []NvidiaGPUStatus
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 4 {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, NvidiaGPUStatus{RawLine: line, Status: "UNKNOWN", ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
needsReset := strings.Contains(upper, "GPU REQUIRES RESET")
|
||||
status := "OK"
|
||||
if needsReset {
|
||||
status = "RESET_REQUIRED"
|
||||
}
|
||||
gpus = append(gpus, NvidiaGPUStatus{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
BDF: normalizeNvidiaBusID(strings.TrimSpace(parts[2])),
|
||||
Serial: strings.TrimSpace(parts[3]),
|
||||
Status: status,
|
||||
RawLine: line,
|
||||
NeedsReset: needsReset,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool { return gpus[i].Index < gpus[j].Index })
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
func normalizeNvidiaBusID(v string) string {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
parts := strings.Split(v, ":")
|
||||
if len(parts) == 3 && len(parts[0]) > 4 {
|
||||
parts[0] = parts[0][len(parts[0])-4:]
|
||||
return strings.Join(parts, ":")
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
raw, err := satExecCommand("nvidia-smi", "-r", "-i", strconv.Itoa(index)).CombinedOutput()
|
||||
if len(raw) == 0 && err == nil {
|
||||
raw = []byte("GPU reset completed.\n")
|
||||
}
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across all NVIDIA GPUs.
|
||||
// Measures collective communication bandwidth over NVLink/PCIe.
|
||||
func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -267,13 +375,101 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-power.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-pulse-test.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-nvbandwidth.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -285,12 +481,77 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
|
||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||
// ctx cancellation kills the running job.
|
||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
||||
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-stress.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
if len(gpuIndices) > 0 {
|
||||
return dedupeSortedIndices(gpuIndices), nil
|
||||
}
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func memoryStressSizeArg() string {
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
return fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
availBytes := satFreeMemBytes()
|
||||
if availBytes <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
availMB := availBytes / (1024 * 1024)
|
||||
targetMB := (availMB * 2) / 3
|
||||
if targetMB >= 256 {
|
||||
targetMB = (targetMB / 256) * 256
|
||||
}
|
||||
if targetMB <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
return fmt.Sprintf("%dM", targetMB)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if sizeMB <= 0 {
|
||||
sizeMB = 256
|
||||
}
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
@@ -303,11 +564,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
}
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||
sizeArg := memoryStressSizeArg()
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
@@ -349,7 +608,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -381,7 +640,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
@@ -403,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
@@ -424,14 +679,24 @@ type satStats struct {
|
||||
Unsupported int
|
||||
}
|
||||
|
||||
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||
out := make([]satJob, 0, len(jobs)+1)
|
||||
out = append(out, satJob{
|
||||
name: "00-nvidia-smi-persistence-mode.log",
|
||||
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||
})
|
||||
out = append(out, jobs...)
|
||||
return out
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
@@ -446,11 +711,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs, gpuIndices: gpuIndices},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||
args := []string{"dcgmi", "diag", "-r", name}
|
||||
if durationSec > 0 {
|
||||
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
func normalizeNvidiaBurnDuration(durationSec int) int {
|
||||
if durationSec <= 0 {
|
||||
return 300
|
||||
}
|
||||
return durationSec
|
||||
}
|
||||
|
||||
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
}
|
||||
return []string{
|
||||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -470,11 +763,23 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
|
||||
var summary strings.Builder
|
||||
stats := satStats{}
|
||||
nvidiaPack := strings.HasPrefix(prefix, "gpu-nvidia")
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{}
|
||||
selectedGPUIndices := map[int]struct{}{}
|
||||
fmt.Fprintf(&summary, "run_at_utc=%s\n", time.Now().UTC().Format(time.RFC3339))
|
||||
for _, job := range jobs {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
for _, idx := range job.gpuIndices {
|
||||
selectedGPUIndices[idx] = struct{}{}
|
||||
status := perGPU[idx]
|
||||
if status == nil {
|
||||
status = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = status
|
||||
}
|
||||
status.Selected = true
|
||||
}
|
||||
cmd := make([]string, 0, len(job.cmd))
|
||||
for _, arg := range job.cmd {
|
||||
cmd = append(cmd, strings.ReplaceAll(arg, "{{run_dir}}", runDir))
|
||||
@@ -483,17 +788,52 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
var out []byte
|
||||
var err error
|
||||
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
out = []byte(msg + "\n")
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
if job.collectGPU {
|
||||
out, err = runSATCommandWithMetrics(ctx, verboseLog, job.name, cmd, job.env, job.gpuIndices, runDir, logFunc)
|
||||
} else {
|
||||
out, err = runSATCommandCtx(ctx, verboseLog, job.name, cmd, job.env, logFunc)
|
||||
}
|
||||
}
|
||||
|
||||
if nvidiaPack && nvidiaJobNeedsHealthCheck(job) {
|
||||
if msg, healthErr := checkNvidiaJobHealth(job.gpuIndices); healthErr != nil {
|
||||
if logFunc != nil {
|
||||
logFunc(msg)
|
||||
}
|
||||
if len(out) > 0 && !bytes.HasSuffix(out, []byte("\n")) {
|
||||
out = append(out, '\n')
|
||||
}
|
||||
out = append(out, []byte(msg+"\n")...)
|
||||
if err == nil {
|
||||
err = healthErr
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return "", ctx.Err()
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
if nvidiaPack && len(job.gpuIndices) > 0 && nvidiaJobNeedsHealthCheck(job) {
|
||||
for _, idx := range job.gpuIndices {
|
||||
updateNvidiaGPUStatus(perGPU, idx, status, job.name, string(out))
|
||||
}
|
||||
}
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
@@ -502,12 +842,204 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
if nvidiaPack {
|
||||
if err := writeNvidiaGPUStatusFiles(runDir, stats.Overall(), perGPU, selectedGPUIndices); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
return archive, nil
|
||||
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
if nvidiaSATStatusSeverity(status) >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = status
|
||||
entry.FailingJob = jobName
|
||||
entry.Reason = firstLine(detail)
|
||||
}
|
||||
}
|
||||
|
||||
func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPUStatusFile, selected map[int]struct{}) error {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err == nil {
|
||||
for _, gpu := range health {
|
||||
entry := perGPU[gpu.Index]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: gpu.Index}
|
||||
perGPU[gpu.Index] = entry
|
||||
}
|
||||
entry.Name = gpu.Name
|
||||
entry.Observed = true
|
||||
entry.HealthRaw = gpu.RawLine
|
||||
if gpu.NeedsReset {
|
||||
entry.Health = "RESET_REQUIRED"
|
||||
if entry.RunStatus == "" || nvidiaSATStatusSeverity("FAILED") >= nvidiaSATStatusSeverity(entry.RunStatus) {
|
||||
entry.RunStatus = "FAILED"
|
||||
if strings.TrimSpace(entry.Reason) == "" {
|
||||
entry.Reason = "GPU requires reset"
|
||||
}
|
||||
}
|
||||
} else {
|
||||
entry.Health = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
for idx := range selected {
|
||||
entry := perGPU[idx]
|
||||
if entry == nil {
|
||||
entry = &nvidiaGPUStatusFile{Index: idx}
|
||||
perGPU[idx] = entry
|
||||
}
|
||||
entry.Selected = true
|
||||
}
|
||||
var indices []int
|
||||
for idx := range perGPU {
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
sort.Ints(indices)
|
||||
for _, idx := range indices {
|
||||
entry := perGPU[idx]
|
||||
if entry.RunStatus == "" {
|
||||
entry.RunStatus = overall
|
||||
}
|
||||
if entry.Health == "" {
|
||||
entry.Health = "UNKNOWN"
|
||||
}
|
||||
if entry.Name == "" {
|
||||
entry.Name = "Unknown GPU"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||
fmt.Fprintf(&body, "gpu_name=%s\n", entry.Name)
|
||||
fmt.Fprintf(&body, "selected=%t\n", entry.Selected)
|
||||
fmt.Fprintf(&body, "observed=%t\n", entry.Observed)
|
||||
fmt.Fprintf(&body, "run_status=%s\n", entry.RunStatus)
|
||||
fmt.Fprintf(&body, "health_status=%s\n", entry.Health)
|
||||
if strings.TrimSpace(entry.FailingJob) != "" {
|
||||
fmt.Fprintf(&body, "failing_job=%s\n", entry.FailingJob)
|
||||
}
|
||||
if strings.TrimSpace(entry.Reason) != "" {
|
||||
fmt.Fprintf(&body, "reason=%s\n", entry.Reason)
|
||||
}
|
||||
if strings.TrimSpace(entry.HealthRaw) != "" {
|
||||
fmt.Fprintf(&body, "health_raw=%s\n", entry.HealthRaw)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-status.txt", idx)), []byte(body.String()), 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func nvidiaSATStatusSeverity(status string) int {
|
||||
switch strings.ToUpper(strings.TrimSpace(status)) {
|
||||
case "FAILED":
|
||||
return 3
|
||||
case "PARTIAL", "UNSUPPORTED":
|
||||
return 2
|
||||
case "OK":
|
||||
return 1
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func firstLine(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if idx := strings.IndexByte(s, '\n'); idx >= 0 {
|
||||
return strings.TrimSpace(s[:idx])
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func nvidiaJobNeedsHealthCheck(job satJob) bool {
|
||||
if job.collectGPU {
|
||||
return true
|
||||
}
|
||||
name := strings.ToLower(strings.TrimSpace(job.name))
|
||||
return strings.Contains(name, "dcgmi") ||
|
||||
strings.Contains(name, "gpu-burn") ||
|
||||
strings.Contains(name, "gpu-stress") ||
|
||||
strings.Contains(name, "dcgmproftester")
|
||||
}
|
||||
|
||||
func checkNvidiaJobHealth(selected []int) (string, error) {
|
||||
health, err := readNvidiaGPUHealth()
|
||||
if err != nil {
|
||||
return "", nil
|
||||
}
|
||||
var bad []nvidiaGPUHealth
|
||||
selectedSet := make(map[int]struct{}, len(selected))
|
||||
for _, idx := range selected {
|
||||
selectedSet[idx] = struct{}{}
|
||||
}
|
||||
for _, gpu := range health {
|
||||
if len(selectedSet) > 0 {
|
||||
if _, ok := selectedSet[gpu.Index]; !ok {
|
||||
continue
|
||||
}
|
||||
}
|
||||
if gpu.NeedsReset {
|
||||
bad = append(bad, gpu)
|
||||
}
|
||||
}
|
||||
if len(bad) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
lines := make([]string, 0, len(bad)+1)
|
||||
lines = append(lines, "NVIDIA GPU health check failed:")
|
||||
for _, gpu := range bad {
|
||||
lines = append(lines, fmt.Sprintf("gpu %d (%s) requires reset: %s", gpu.Index, gpu.Name, gpu.RawLine))
|
||||
}
|
||||
return strings.Join(lines, "\n"), errors.New("nvidia gpu requires reset")
|
||||
}
|
||||
|
||||
func readNvidiaGPUHealth() ([]nvidiaGPUHealth, error) {
|
||||
out, err := satExecCommand(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
return parseNvidiaGPUHealth(string(out)), nil
|
||||
}
|
||||
|
||||
func parseNvidiaGPUHealth(raw string) []nvidiaGPUHealth {
|
||||
var gpus []nvidiaGPUHealth
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 2 {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
if err != nil {
|
||||
gpus = append(gpus, nvidiaGPUHealth{RawLine: line, ParseFailure: true})
|
||||
continue
|
||||
}
|
||||
upper := strings.ToUpper(line)
|
||||
gpus = append(gpus, nvidiaGPUHealth{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
NeedsReset: strings.Contains(upper, "GPU REQUIRES RESET"),
|
||||
RawLine: line,
|
||||
})
|
||||
}
|
||||
return gpus
|
||||
}
|
||||
|
||||
func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string, env []string, logFunc func(string)) ([]byte, error) {
|
||||
@@ -531,6 +1063,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
c.Cancel = func() error {
|
||||
if c.Process != nil {
|
||||
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -557,17 +1096,25 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -616,6 +1163,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
}
|
||||
if strings.Contains(text, "unsupported") ||
|
||||
strings.Contains(text, "not supported") ||
|
||||
strings.Contains(text, "not found in path") ||
|
||||
strings.Contains(text, "invalid opcode") ||
|
||||
strings.Contains(text, "unknown command") ||
|
||||
strings.Contains(text, "not implemented") ||
|
||||
@@ -625,6 +1173,11 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
||||
// nvidia-smi on a machine with no NVIDIA GPU
|
||||
strings.Contains(text, "couldn't communicate with the nvidia driver") ||
|
||||
strings.Contains(text, "no nvidia gpu") ||
|
||||
// Some NVMe firmwares start self-test but never expose progress to nvme-cli
|
||||
// while waiting, so the CLI stops polling without proving device failure.
|
||||
(strings.Contains(name, "self-test") &&
|
||||
strings.Contains(text, "no progress for") &&
|
||||
strings.Contains(text, "stop waiting")) ||
|
||||
(strings.Contains(name, "self-test") && strings.Contains(text, "aborted")) {
|
||||
return "UNSUPPORTED", rc
|
||||
}
|
||||
@@ -722,6 +1275,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||
}
|
||||
|
||||
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||
for _, candidate := range dcgmProfTesterCandidates {
|
||||
if path, err := satLookPath(candidate); err == nil {
|
||||
return append([]string{path}, args...), nil
|
||||
}
|
||||
}
|
||||
return nil, errors.New("dcgmproftester not found in PATH")
|
||||
}
|
||||
|
||||
func ensureAMDRuntimeReady() error {
|
||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||
return nil
|
||||
@@ -820,8 +1382,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
|
||||
if len(metricRows) > 0 {
|
||||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
|
||||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
|
||||
chart := RenderGPUTerminalChart(metricRows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
|
||||
}
|
||||
|
||||
return out, err
|
||||
|
||||
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
@@ -51,6 +51,18 @@ type FanStressRow struct {
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
type cachedPowerReading struct {
|
||||
Value float64
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
var (
|
||||
systemPowerCacheMu sync.Mutex
|
||||
systemPowerCache cachedPowerReading
|
||||
)
|
||||
|
||||
const systemPowerHoldTTL = 15 * time.Second
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -211,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
@@ -231,9 +239,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
@@ -419,6 +426,101 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
|
||||
return fans, nil
|
||||
}
|
||||
|
||||
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
|
||||
// Returns the average duty cycle across all exposed PWM controls.
|
||||
func sampleFanDutyCyclePct() (float64, bool) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return parseFanDutyCyclePctSensorsJSON(out)
|
||||
}
|
||||
|
||||
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
|
||||
var doc map[string]map[string]any
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
return 0, false
|
||||
}
|
||||
var samples []float64
|
||||
for _, features := range doc {
|
||||
for name, feature := range features {
|
||||
if strings.EqualFold(name, "Adapter") {
|
||||
continue
|
||||
}
|
||||
featureMap, ok := feature.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if duty, ok := firstFanDutyValue(name, featureMap); ok {
|
||||
samples = append(samples, duty)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return benchmarkMean(samples), true
|
||||
}
|
||||
|
||||
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
|
||||
featureName = strings.ToLower(strings.TrimSpace(featureName))
|
||||
if strings.Contains(featureName, "enable") || strings.Contains(featureName, "mode") || strings.Contains(featureName, "alarm") {
|
||||
return 0, false
|
||||
}
|
||||
if strings.Contains(featureName, "pwm") {
|
||||
for _, key := range []string{"input", "value", "current"} {
|
||||
if value, ok := feature[key]; ok {
|
||||
if duty, parsed := parseFanDutyValue(value); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if !strings.Contains(lower, "pwm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "enable") || strings.Contains(lower, "mode") || strings.Contains(lower, "alarm") {
|
||||
continue
|
||||
}
|
||||
if duty, parsed := parseFanDutyValue(feature[key]); parsed {
|
||||
return duty, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func parseFanDutyValue(value any) (float64, bool) {
|
||||
switch v := value.(type) {
|
||||
case float64:
|
||||
return normalizePWMAsDutyPct(v)
|
||||
case string:
|
||||
if f, err := strconv.ParseFloat(strings.TrimSpace(v), 64); err == nil {
|
||||
return normalizePWMAsDutyPct(f)
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
|
||||
if raw < 0 {
|
||||
return 0, false
|
||||
}
|
||||
if raw <= 100 {
|
||||
return raw, true
|
||||
}
|
||||
if raw <= 255 {
|
||||
return raw / 255.0 * 100.0, true
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFanInputValue(feature map[string]any) (float64, bool) {
|
||||
keys := make([]string, 0, len(feature))
|
||||
for key := range feature {
|
||||
@@ -508,11 +610,17 @@ func sampleCPUTempViaSensors() float64 {
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -535,6 +643,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
return cache.Value, cache
|
||||
}
|
||||
return 0, cache
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseFanSpeeds(t *testing.T) {
|
||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||
@@ -25,3 +28,61 @@ func TestFirstFanInputValue(t *testing.T) {
|
||||
t.Fatalf("got=%v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"chip0": {
|
||||
"fan1": {"input": 9000},
|
||||
"pwm1": {"input": 128},
|
||||
"pwm1_enable": {"input": 1}
|
||||
},
|
||||
"chip1": {
|
||||
"pwm2": {"input": 64}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, ok := parseFanDutyCyclePctSensorsJSON(raw)
|
||||
if !ok {
|
||||
t.Fatalf("expected duty cycle telemetry to be parsed")
|
||||
}
|
||||
if got < 57 || got > 58 {
|
||||
t.Fatalf("got=%v want ~57.1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
Minimum during sampling period: 498 Watts
|
||||
`
|
||||
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
if updated.Value != 480 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
if updated.Value != 530 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,23 +1,25 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1")
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda")
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
}
|
||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
if len(jobs) != 6 {
|
||||
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
got := jobs[5].cmd
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -162,6 +183,157 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
got, err := resolveDCGMGPUIndices(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "0,1,2"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||
}
|
||||
if want := "1,3"; joinIndexList(got) != want {
|
||||
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNvidiaGPUHealthDetectsResetRequired(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNvidiaGPUHealth("0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n")
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len=%d want 2", len(got))
|
||||
}
|
||||
if got[0].NeedsReset {
|
||||
t.Fatalf("gpu0 unexpectedly marked reset-required")
|
||||
}
|
||||
if !got[1].NeedsReset {
|
||||
t.Fatalf("gpu1 should be marked reset-required: %#v", got[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNvidiaJobHealthReturnsErrorForSelectedResetRequiredGPU(t *testing.T) {
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
msg, err := checkNvidiaJobHealth([]int{1})
|
||||
if err == nil {
|
||||
t.Fatal("expected health check error")
|
||||
}
|
||||
if !strings.Contains(msg, "gpu 1") || !strings.Contains(strings.ToLower(msg), "requires reset") {
|
||||
t.Fatalf("unexpected message: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteNvidiaGPUStatusFilesCreatesPerGPUFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0, NVIDIA H100 PCIe, 38, 46.89, 0, 0, 81559\n1, NVIDIA H100 PCIe, [GPU requires reset], [N/A], [N/A], 0, 81559\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
perGPU := map[int]*nvidiaGPUStatusFile{
|
||||
0: {Index: 0, RunStatus: "OK"},
|
||||
1: {Index: 1, RunStatus: "FAILED", FailingJob: "02-dcgmi-targeted-stress.log", Reason: "NVIDIA GPU health check failed:"},
|
||||
}
|
||||
if err := writeNvidiaGPUStatusFiles(dir, "FAILED", perGPU, map[int]struct{}{0: {}, 1: {}}); err != nil {
|
||||
t.Fatalf("writeNvidiaGPUStatusFiles error: %v", err)
|
||||
}
|
||||
raw, err := os.ReadFile(filepath.Join(dir, "gpu-1-status.txt"))
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile gpu-1-status.txt: %v", err)
|
||||
}
|
||||
text := string(raw)
|
||||
if !strings.Contains(text, "run_status=FAILED") {
|
||||
t.Fatalf("missing run status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "health_status=RESET_REQUIRED") {
|
||||
t.Fatalf("missing health status:\n%s", text)
|
||||
}
|
||||
if !strings.Contains(text, "failing_job=02-dcgmi-targeted-stress.log") {
|
||||
t.Fatalf("missing failing job:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
switch file {
|
||||
case "dcgmproftester13":
|
||||
return "/usr/bin/dcgmproftester13", nil
|
||||
default:
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||
if err != nil {
|
||||
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 4 {
|
||||
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||
if len(cmd) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||
}
|
||||
for i := range want {
|
||||
if cmd[i] != want[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 2 {
|
||||
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||
}
|
||||
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||
}
|
||||
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -196,6 +368,37 @@ func TestEnvIntFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "65536M" {
|
||||
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||
|
||||
if got := memoryStressSizeArg(); got != "4096M" {
|
||||
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 0 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "80%" {
|
||||
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifySATResult(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -206,6 +409,7 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "nvme wait timeout without progress", job: "nvme-device-self-test", out: "Short Device self-test started\nWaiting for self test completion...\nno progress for 78 seconds, stop waiting", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
@@ -220,6 +424,38 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
cancel()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||
}, nil)
|
||||
<-done
|
||||
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Fatalf("err=%v want context.Canceled", err)
|
||||
}
|
||||
if archive != "" {
|
||||
t.Fatalf("archive=%q want empty", archive)
|
||||
}
|
||||
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||
if globErr != nil {
|
||||
t.Fatalf("Glob error: %v", globErr)
|
||||
}
|
||||
if len(matches) != 0 {
|
||||
t.Fatalf("archives=%v want none", matches)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -10,17 +10,30 @@ import (
|
||||
func (s *System) ListBeeServices() ([]string, error) {
|
||||
seen := map[string]bool{}
|
||||
var out []string
|
||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
||||
for _, pattern := range []string{
|
||||
"/etc/systemd/system/bee-*.service",
|
||||
"/lib/systemd/system/bee-*.service",
|
||||
"/etc/systemd/system/bee-*.timer",
|
||||
"/lib/systemd/system/bee-*.timer",
|
||||
} {
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, match := range matches {
|
||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
||||
base := filepath.Base(match)
|
||||
name := base
|
||||
if strings.HasSuffix(base, ".service") {
|
||||
name = strings.TrimSuffix(base, ".service")
|
||||
}
|
||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||
if strings.HasSuffix(name, "@") {
|
||||
continue
|
||||
}
|
||||
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||
continue
|
||||
}
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
out = append(out, name)
|
||||
@@ -48,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
|
||||
@@ -2,6 +2,13 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
type LiveBootSource struct {
|
||||
InRAM bool `json:"in_ram"`
|
||||
Kind string `json:"kind"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
@@ -37,12 +44,12 @@ type StaticIPv4Config struct {
|
||||
}
|
||||
|
||||
type RemovableTarget struct {
|
||||
Device string
|
||||
FSType string
|
||||
Size string
|
||||
Label string
|
||||
Model string
|
||||
Mountpoint string
|
||||
Device string `json:"device"`
|
||||
FSType string `json:"fs_type"`
|
||||
Size string `json:"size"`
|
||||
Label string `json:"label"`
|
||||
Model string `json:"model"`
|
||||
Mountpoint string `json:"mountpoint"`
|
||||
}
|
||||
|
||||
type ToolStatus struct {
|
||||
@@ -63,6 +70,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
data, err := json.Marshal(RemovableTarget{
|
||||
Device: "/dev/sdb1",
|
||||
FSType: "exfat",
|
||||
Size: "1.8T",
|
||||
Label: "USB",
|
||||
Model: "Flash",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
raw := string(data)
|
||||
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||
if !strings.Contains(raw, key) {
|
||||
t.Fatalf("json missing key %s: %s", key, raw)
|
||||
}
|
||||
}
|
||||
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||
t.Fatalf("json still contains Go field names: %s", raw)
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,12 @@ type RuntimeHealth struct {
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
@@ -182,6 +187,13 @@ type HardwarePCIeDevice struct {
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,37 +1,15 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
@@ -61,4 +39,211 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
task := globalQueue.tasks[0]
|
||||
if task.Target != "nvidia-benchmark" {
|
||||
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||
}
|
||||
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||
}
|
||||
if task.params.RunNCCL {
|
||||
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||
}
|
||||
if task.Priority != taskPriorityBenchmark {
|
||||
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var resp taskRunResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if len(resp.TaskIDs) != 2 {
|
||||
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
|
||||
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDefaultTaskPriorityOrder(t *testing.T) {
|
||||
got := []int{
|
||||
defaultTaskPriority("install-to-ram", taskParams{}),
|
||||
defaultTaskPriority("audit", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{}),
|
||||
defaultTaskPriority("cpu", taskParams{StressMode: true}),
|
||||
defaultTaskPriority("nvidia-stress", taskParams{}),
|
||||
defaultTaskPriority("nvidia-benchmark", taskParams{}),
|
||||
}
|
||||
want := []int{
|
||||
taskPriorityInstallToRAM,
|
||||
taskPriorityAudit,
|
||||
taskPriorityValidate,
|
||||
taskPriorityValidateStress,
|
||||
taskPriorityBurn,
|
||||
taskPriorityBenchmark,
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
|
||||
t.Fatalf("priority order=%v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
|
||||
871
audit/internal/webui/charts_svg.go
Normal file
871
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,871 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
type chartTimelineSegment struct {
|
||||
Start time.Time
|
||||
End time.Time
|
||||
Active bool
|
||||
}
|
||||
|
||||
type chartScale struct {
|
||||
Min float64
|
||||
Max float64
|
||||
Ticks []float64
|
||||
}
|
||||
|
||||
type chartLayout struct {
|
||||
Width int
|
||||
Height int
|
||||
PlotLeft int
|
||||
PlotRight int
|
||||
PlotTop int
|
||||
PlotBottom int
|
||||
}
|
||||
|
||||
type metricChartSeries struct {
|
||||
Name string
|
||||
AxisTitle string
|
||||
Color string
|
||||
Values []float64
|
||||
}
|
||||
|
||||
var metricChartPalette = []string{
|
||||
"#5794f2",
|
||||
"#73bf69",
|
||||
"#f2cc0c",
|
||||
"#ff9830",
|
||||
"#f2495c",
|
||||
"#b877d9",
|
||||
"#56d2f7",
|
||||
"#8ab8ff",
|
||||
"#9adf8f",
|
||||
"#ffbe5c",
|
||||
}
|
||||
|
||||
var gpuLabelCache struct {
|
||||
mu sync.Mutex
|
||||
loadedAt time.Time
|
||||
byIndex map[int]string
|
||||
}
|
||||
|
||||
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range datasets {
|
||||
if len(datasets[i]) == 0 {
|
||||
datasets[i] = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
for i, name := range names {
|
||||
color := metricChartPalette[i%len(metricChartPalette)]
|
||||
values := make([]float64, pointCount)
|
||||
if i < len(datasets) {
|
||||
copy(values, coalesceDataset(datasets[i], pointCount))
|
||||
}
|
||||
legendItems = append(legendItems, metricChartSeries{
|
||||
Name: name,
|
||||
Color: color,
|
||||
Values: values,
|
||||
})
|
||||
}
|
||||
|
||||
scale := singleAxisChartScale(datasets, yMin, yMax)
|
||||
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
writeSingleAxisY(&b, layout, scale)
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for _, item := range legendItems {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
|
||||
}
|
||||
writeLegend(&b, layout, legendItems)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if temp == nil && power == nil && coreClock == nil {
|
||||
return nil, false, nil
|
||||
}
|
||||
labels := sampleTimeLabels(samples)
|
||||
times := sampleTimes(samples)
|
||||
svg, err := drawGPUOverviewChartSVG(
|
||||
gpuDisplayLabel(idx)+" Overview",
|
||||
labels,
|
||||
times,
|
||||
[]metricChartSeries{
|
||||
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||
},
|
||||
timeline,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
return svg, true, nil
|
||||
}
|
||||
|
||||
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
if len(series) != 3 {
|
||||
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||
}
|
||||
const (
|
||||
width = 1400
|
||||
height = 840
|
||||
plotLeft = 180
|
||||
plotRight = 1220
|
||||
plotTop = 96
|
||||
plotBottom = 660
|
||||
)
|
||||
const (
|
||||
leftOuterAxis = 72
|
||||
leftInnerAxis = 132
|
||||
rightInnerAxis = 1268
|
||||
)
|
||||
layout := chartLayout{
|
||||
Width: width,
|
||||
Height: height,
|
||||
PlotLeft: plotLeft,
|
||||
PlotRight: plotRight,
|
||||
PlotTop: plotTop,
|
||||
PlotBottom: plotBottom,
|
||||
}
|
||||
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
if pointCount == 0 {
|
||||
pointCount = 1
|
||||
labels = []string{""}
|
||||
times = []time.Time{time.Time{}}
|
||||
}
|
||||
if len(labels) < pointCount {
|
||||
padded := make([]string, pointCount)
|
||||
copy(padded, labels)
|
||||
labels = padded
|
||||
}
|
||||
if len(times) < pointCount {
|
||||
times = synthesizeChartTimes(times, pointCount)
|
||||
}
|
||||
for i := range series {
|
||||
if len(series[i].Values) == 0 {
|
||||
series[i].Values = make([]float64, pointCount)
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points before building SVG.
|
||||
{
|
||||
datasets := make([][]float64, len(series))
|
||||
for i := range series {
|
||||
datasets[i] = series[i].Values
|
||||
}
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
for i := range series {
|
||||
series[i].Values = datasets[i]
|
||||
}
|
||||
}
|
||||
|
||||
scales := make([]chartScale, len(series))
|
||||
for i := range series {
|
||||
min, max := chartSeriesBounds(series[i].Values)
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
scales[i] = chartScale{
|
||||
Min: ticks[0],
|
||||
Max: ticks[len(ticks)-1],
|
||||
Ticks: ticks,
|
||||
}
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, width, height)
|
||||
writeChartFrame(&b, title, "", width, height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scales[0])
|
||||
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||
writePlotBorder(&b, layout)
|
||||
|
||||
for i, axisLineX := range axisX {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
|
||||
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
|
||||
for _, tick := range scales[i].Ticks {
|
||||
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
|
||||
label := sanitizeChartText(chartYAxisNumber(tick))
|
||||
if i < 2 {
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX+6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX-8, y, series[i].Color, label)
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||
axisLineX, y, axisLineX-6, y, series[i].Color)
|
||||
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||
axisLineX+8, y, series[i].Color, label)
|
||||
}
|
||||
}
|
||||
|
||||
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||
for i := range series {
|
||||
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
|
||||
}
|
||||
writeLegend(&b, layout, series)
|
||||
writeSVGClose(&b)
|
||||
return []byte(b.String()), nil
|
||||
}
|
||||
|
||||
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||
if len(samples) == 0 {
|
||||
return nil
|
||||
}
|
||||
times := sampleTimes(samples)
|
||||
start, end := chartTimeBounds(times)
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||
}
|
||||
|
||||
func snapshotTaskHistory() []Task {
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
out := make([]Task, len(globalQueue.tasks))
|
||||
for i, t := range globalQueue.tasks {
|
||||
out[i] = *t
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
type interval struct {
|
||||
start time.Time
|
||||
end time.Time
|
||||
}
|
||||
active := make([]interval, 0, len(tasks))
|
||||
for _, task := range tasks {
|
||||
if task.StartedAt == nil {
|
||||
continue
|
||||
}
|
||||
intervalStart := task.StartedAt.UTC()
|
||||
intervalEnd := now.UTC()
|
||||
if task.DoneAt != nil {
|
||||
intervalEnd = task.DoneAt.UTC()
|
||||
}
|
||||
if !intervalEnd.After(intervalStart) {
|
||||
continue
|
||||
}
|
||||
if intervalEnd.Before(start) || intervalStart.After(end) {
|
||||
continue
|
||||
}
|
||||
if intervalStart.Before(start) {
|
||||
intervalStart = start
|
||||
}
|
||||
if intervalEnd.After(end) {
|
||||
intervalEnd = end
|
||||
}
|
||||
active = append(active, interval{start: intervalStart, end: intervalEnd})
|
||||
}
|
||||
sort.Slice(active, func(i, j int) bool {
|
||||
if active[i].start.Equal(active[j].start) {
|
||||
return active[i].end.Before(active[j].end)
|
||||
}
|
||||
return active[i].start.Before(active[j].start)
|
||||
})
|
||||
merged := make([]interval, 0, len(active))
|
||||
for _, span := range active {
|
||||
if len(merged) == 0 {
|
||||
merged = append(merged, span)
|
||||
continue
|
||||
}
|
||||
last := &merged[len(merged)-1]
|
||||
if !span.start.After(last.end) {
|
||||
if span.end.After(last.end) {
|
||||
last.end = span.end
|
||||
}
|
||||
continue
|
||||
}
|
||||
merged = append(merged, span)
|
||||
}
|
||||
|
||||
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
|
||||
cursor := start
|
||||
for _, span := range merged {
|
||||
if span.start.After(cursor) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
|
||||
}
|
||||
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
|
||||
cursor = span.end
|
||||
}
|
||||
if cursor.Before(end) {
|
||||
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
|
||||
}
|
||||
if len(segments) == 0 {
|
||||
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
|
||||
}
|
||||
return segments
|
||||
}
|
||||
|
||||
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||
times := make([]time.Time, 0, len(samples))
|
||||
for _, sample := range samples {
|
||||
times = append(times, sample.Timestamp)
|
||||
}
|
||||
return times
|
||||
}
|
||||
|
||||
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||
min, max := 0.0, 1.0
|
||||
if yMin != nil && yMax != nil {
|
||||
min, max = *yMin, *yMax
|
||||
} else {
|
||||
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||
if yMin != nil {
|
||||
min = *yMin
|
||||
}
|
||||
if yMax != nil {
|
||||
max = *yMax
|
||||
}
|
||||
}
|
||||
ticks := chartNiceTicks(min, max, 8)
|
||||
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||
}
|
||||
|
||||
func flattenDatasets(datasets [][]float64) []float64 {
|
||||
total := 0
|
||||
for _, ds := range datasets {
|
||||
total += len(ds)
|
||||
}
|
||||
out := make([]float64, 0, total)
|
||||
for _, ds := range datasets {
|
||||
out = append(out, ds...)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||
legendRows := 0
|
||||
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||
cols := 4
|
||||
if seriesCount < cols {
|
||||
cols = seriesCount
|
||||
}
|
||||
legendRows = (seriesCount + cols - 1) / cols
|
||||
}
|
||||
legendHeight := 0
|
||||
if legendRows > 0 {
|
||||
legendHeight = legendRows*24 + 24
|
||||
}
|
||||
return chartLayout{
|
||||
Width: 1400,
|
||||
Height: canvasHeight,
|
||||
PlotLeft: 96,
|
||||
PlotRight: 1352,
|
||||
PlotTop: 72,
|
||||
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||
}
|
||||
}
|
||||
|
||||
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
|
||||
if len(times) == 0 {
|
||||
return time.Time{}, time.Time{}
|
||||
}
|
||||
start := times[0].UTC()
|
||||
end := start
|
||||
for _, ts := range times[1:] {
|
||||
t := ts.UTC()
|
||||
if t.Before(start) {
|
||||
start = t
|
||||
}
|
||||
if t.After(end) {
|
||||
end = t
|
||||
}
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
|
||||
if count <= 0 {
|
||||
return nil
|
||||
}
|
||||
if len(times) == count {
|
||||
return times
|
||||
}
|
||||
if len(times) == 1 {
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = times[0].Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
|
||||
out := make([]time.Time, count)
|
||||
for i := range out {
|
||||
out[i] = base.Add(time.Duration(i) * time.Minute)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func writeSVGOpen(b *strings.Builder, width, height int) {
|
||||
fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", width, height, width, height)
|
||||
}
|
||||
|
||||
func writeSVGClose(b *strings.Builder) {
|
||||
b.WriteString("</svg>\n")
|
||||
}
|
||||
|
||||
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(title))
|
||||
if strings.TrimSpace(subtitle) != "" {
|
||||
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(subtitle))
|
||||
}
|
||||
}
|
||||
|
||||
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
|
||||
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotRight, y)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||
if pointCount <= 0 {
|
||||
return
|
||||
}
|
||||
start, end := chartTimeBounds(times)
|
||||
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
ts := chartPointTime(times, idx)
|
||||
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||
x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||
for _, tick := range scale.Ticks {
|
||||
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||
}
|
||||
}
|
||||
|
||||
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
}
|
||||
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
label := ""
|
||||
if idx < len(labels) {
|
||||
label = labels[idx]
|
||||
}
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||
}
|
||||
|
||||
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||
if len(values) == 0 {
|
||||
return
|
||||
}
|
||||
var points strings.Builder
|
||||
for idx, value := range values {
|
||||
if idx > 0 {
|
||||
points.WriteByte(' ')
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||
points.WriteByte(',')
|
||||
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||
}
|
||||
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||
points.String(), color)
|
||||
if len(values) == 1 {
|
||||
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||
return
|
||||
}
|
||||
peakIdx := 0
|
||||
peakValue := values[0]
|
||||
for idx, value := range values[1:] {
|
||||
if value >= peakValue {
|
||||
peakIdx = idx + 1
|
||||
peakValue = value
|
||||
}
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||
}
|
||||
|
||||
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||
return
|
||||
}
|
||||
cols := 4
|
||||
if len(series) < cols {
|
||||
cols = len(series)
|
||||
}
|
||||
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||
baseY := layout.PlotBottom + 74
|
||||
for i, item := range series {
|
||||
row := i / cols
|
||||
col := i % cols
|
||||
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||
y := float64(baseY + row*24)
|
||||
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||
x, y, x+28, y, item.Color)
|
||||
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||
x+38, y+4, sanitizeChartText(item.Name))
|
||||
}
|
||||
}
|
||||
|
||||
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||
for _, segment := range segments {
|
||||
if segment.Active || !segment.End.After(segment.Start) {
|
||||
continue
|
||||
}
|
||||
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||
if len(segments) == 0 {
|
||||
return
|
||||
}
|
||||
seen := map[int]bool{}
|
||||
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||
for i, segment := range segments {
|
||||
if i > 0 {
|
||||
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
if i < len(segments)-1 {
|
||||
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||
if !seen[x] {
|
||||
seen[x] = true
|
||||
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
// downsampleTimeSeries reduces the time series to at most maxPts points using
|
||||
// min-max bucketing. Each bucket contributes the index of its min and max value
|
||||
// (using the first full-length dataset as the reference series). All parallel
|
||||
// datasets are sampled at those same indices so all series stay aligned.
|
||||
// If len(times) <= maxPts the inputs are returned unchanged.
|
||||
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
|
||||
n := len(times)
|
||||
if n <= maxPts || maxPts <= 0 {
|
||||
return times, datasets
|
||||
}
|
||||
buckets := maxPts / 2
|
||||
if buckets < 1 {
|
||||
buckets = 1
|
||||
}
|
||||
// Use the first dataset that has the same length as times as the reference
|
||||
// for deciding which two indices to keep per bucket.
|
||||
var ref []float64
|
||||
for _, ds := range datasets {
|
||||
if len(ds) == n {
|
||||
ref = ds
|
||||
break
|
||||
}
|
||||
}
|
||||
selected := make([]int, 0, maxPts)
|
||||
bucketSize := float64(n) / float64(buckets)
|
||||
for b := 0; b < buckets; b++ {
|
||||
lo := int(math.Round(float64(b) * bucketSize))
|
||||
hi := int(math.Round(float64(b+1) * bucketSize))
|
||||
if hi > n {
|
||||
hi = n
|
||||
}
|
||||
if lo >= hi {
|
||||
continue
|
||||
}
|
||||
if ref == nil {
|
||||
selected = append(selected, lo)
|
||||
if hi-1 != lo {
|
||||
selected = append(selected, hi-1)
|
||||
}
|
||||
continue
|
||||
}
|
||||
minIdx, maxIdx := lo, lo
|
||||
for i := lo + 1; i < hi; i++ {
|
||||
if ref[i] < ref[minIdx] {
|
||||
minIdx = i
|
||||
}
|
||||
if ref[i] > ref[maxIdx] {
|
||||
maxIdx = i
|
||||
}
|
||||
}
|
||||
if minIdx <= maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
if maxIdx != minIdx {
|
||||
selected = append(selected, maxIdx)
|
||||
}
|
||||
} else {
|
||||
selected = append(selected, maxIdx)
|
||||
if minIdx != maxIdx {
|
||||
selected = append(selected, minIdx)
|
||||
}
|
||||
}
|
||||
}
|
||||
outTimes := make([]time.Time, len(selected))
|
||||
for i, idx := range selected {
|
||||
outTimes[i] = times[idx]
|
||||
}
|
||||
outDatasets := make([][]float64, len(datasets))
|
||||
for d, ds := range datasets {
|
||||
if len(ds) != n {
|
||||
outDatasets[d] = ds
|
||||
continue
|
||||
}
|
||||
out := make([]float64, len(selected))
|
||||
for i, idx := range selected {
|
||||
out[i] = ds[idx]
|
||||
}
|
||||
outDatasets[d] = out
|
||||
}
|
||||
return outTimes, outDatasets
|
||||
}
|
||||
|
||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||
if !end.After(start) {
|
||||
return float64(left+right) / 2
|
||||
}
|
||||
if ts.Before(start) {
|
||||
ts = start
|
||||
}
|
||||
if ts.After(end) {
|
||||
ts = end
|
||||
}
|
||||
ratio := float64(ts.Sub(start)) / float64(end.Sub(start))
|
||||
return float64(left) + ratio*float64(right-left)
|
||||
}
|
||||
|
||||
func chartPointTime(times []time.Time, idx int) time.Time {
|
||||
if idx >= 0 && idx < len(times) && !times[idx].IsZero() {
|
||||
return times[idx].UTC()
|
||||
}
|
||||
if len(times) > 0 && !times[0].IsZero() {
|
||||
return times[0].UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
return time.Now().UTC().Add(time.Duration(idx) * time.Minute)
|
||||
}
|
||||
|
||||
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||
if scale.Max <= scale.Min {
|
||||
return float64(plotTop+plotBottom) / 2
|
||||
}
|
||||
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||
}
|
||||
|
||||
func chartSeriesBounds(values []float64) (float64, float64) {
|
||||
if len(values) == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
min, max := values[0], values[0]
|
||||
for _, value := range values[1:] {
|
||||
if value < min {
|
||||
min = value
|
||||
}
|
||||
if value > max {
|
||||
max = value
|
||||
}
|
||||
}
|
||||
if min == max {
|
||||
if max == 0 {
|
||||
return 0, 1
|
||||
}
|
||||
pad := math.Abs(max) * 0.1
|
||||
if pad == 0 {
|
||||
pad = 1
|
||||
}
|
||||
min -= pad
|
||||
max += pad
|
||||
}
|
||||
if min > 0 {
|
||||
pad := (max - min) * 0.2
|
||||
if pad == 0 {
|
||||
pad = max * 0.1
|
||||
}
|
||||
min -= pad
|
||||
if min < 0 {
|
||||
min = 0
|
||||
}
|
||||
max += pad
|
||||
}
|
||||
return min, max
|
||||
}
|
||||
|
||||
func chartNiceTicks(min, max float64, target int) []float64 {
|
||||
if min == max {
|
||||
max = min + 1
|
||||
}
|
||||
span := max - min
|
||||
step := math.Pow(10, math.Floor(math.Log10(span/float64(target))))
|
||||
for _, factor := range []float64{1, 2, 5, 10} {
|
||||
if span/(factor*step) <= float64(target)*1.5 {
|
||||
step = factor * step
|
||||
break
|
||||
}
|
||||
}
|
||||
low := math.Floor(min/step) * step
|
||||
high := math.Ceil(max/step) * step
|
||||
var ticks []float64
|
||||
for value := low; value <= high+step*0.001; value += step {
|
||||
ticks = append(ticks, math.Round(value*1e9)/1e9)
|
||||
}
|
||||
return ticks
|
||||
}
|
||||
|
||||
func valueClamp(value float64, scale chartScale) float64 {
|
||||
if value < scale.Min {
|
||||
return scale.Min
|
||||
}
|
||||
if value > scale.Max {
|
||||
return scale.Max
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
func chartStatsLabel(datasets [][]float64) string {
|
||||
mn, avg, mx := globalStats(datasets)
|
||||
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf("min %s avg %s max %s",
|
||||
chartLegendNumber(mn),
|
||||
chartLegendNumber(avg),
|
||||
chartLegendNumber(mx),
|
||||
)
|
||||
}
|
||||
|
||||
func gpuDisplayLabel(idx int) string {
|
||||
if name := gpuModelNameByIndex(idx); name != "" {
|
||||
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||
}
|
||||
return fmt.Sprintf("GPU %d", idx)
|
||||
}
|
||||
|
||||
func gpuModelNameByIndex(idx int) string {
|
||||
now := time.Now()
|
||||
gpuLabelCache.mu.Lock()
|
||||
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||
gpuLabelCache.loadedAt = now
|
||||
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||
}
|
||||
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||
gpuLabelCache.mu.Unlock()
|
||||
return name
|
||||
}
|
||||
|
||||
func loadGPUModelNames() map[int]string {
|
||||
out := map[int]string{}
|
||||
gpus, err := platform.New().ListNvidiaGPUs()
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
for _, gpu := range gpus {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name != "" {
|
||||
out[gpu.Index] = name
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -9,13 +9,14 @@ import (
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
|
||||
type jobState struct {
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
case ch <- line:
|
||||
@@ -84,12 +88,12 @@ func (m *jobManager) create(id string) *jobState {
|
||||
j := &jobState{}
|
||||
m.jobs[id] = j
|
||||
// Schedule cleanup after 30 minutes
|
||||
go func() {
|
||||
goRecoverOnce("job cleanup", func() {
|
||||
time.Sleep(30 * time.Minute)
|
||||
m.mu.Lock()
|
||||
delete(m.jobs, id)
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
})
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string) *jobState {
|
||||
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if len(serialPrefix) > 0 {
|
||||
j.serialPrefix = serialPrefix[0]
|
||||
}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
|
||||
242
audit/internal/webui/kmsg_watcher.go
Normal file
242
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,242 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||
// while any SAT task is running, and flushed when all tasks complete.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeCount int // number of in-flight SAT tasks
|
||||
window *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
targets []string // SAT targets running concurrently
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
type kmsgEventKey struct {
|
||||
id string // BDF or device name
|
||||
category string
|
||||
}
|
||||
|
||||
type kmsgEvent struct {
|
||||
timestamp time.Time
|
||||
raw string
|
||||
ids []string // BDF addresses or device names extracted
|
||||
category string
|
||||
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
for {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
time.Sleep(30 * time.Second)
|
||||
continue
|
||||
}
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
_ = f.Close()
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.window.seen[key] {
|
||||
w.window.seen[key] = true
|
||||
w.window.events = append(w.window.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||
// if this is the first task starting.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if w.activeCount == 0 {
|
||||
w.window = &kmsgWindow{
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
w.activeCount++
|
||||
if w.window != nil {
|
||||
w.window.targets = append(w.window.targets, target)
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||
// it flushes the accumulated events to the status DB.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
w.activeCount--
|
||||
var window *kmsgWindow
|
||||
if w.activeCount <= 0 {
|
||||
w.activeCount = 0
|
||||
window = w.window
|
||||
w.window = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF normalizes a PCIe BDF to the 4-part form "0000:c8:00.0".
|
||||
func normalizeBDF(bdf string) string {
|
||||
bdf = strings.ToLower(strings.TrimSpace(bdf))
|
||||
if strings.Count(bdf, ":") == 1 {
|
||||
return "0000:" + bdf
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
// isSATTarget returns true for task targets that run hardware acceptance tests.
|
||||
func isSATTarget(target string) bool {
|
||||
switch target {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
|
||||
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
|
||||
"platform-stress":
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -4,7 +4,11 @@ import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -18,8 +22,18 @@ type MetricsDB struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
func (m *MetricsDB) Close() error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
return m.db.Close()
|
||||
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -48,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
clock_mhz REAL,
|
||||
mem_clock_mhz REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||
@@ -64,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||
return err
|
||||
}
|
||||
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||
}
|
||||
|
||||
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
|
||||
rows, err := db.Query("PRAGMA table_info(" + table + ")")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
var cid int
|
||||
var name, ctype string
|
||||
var notNull, pk int
|
||||
var dflt sql.NullString
|
||||
if err := rows.Scan(&cid, &name, &ctype, ¬Null, &dflt, &pk); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.EqualFold(name, column) {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -85,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
_, err = tx.Exec(
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||
)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -115,7 +163,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
@@ -123,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
@@ -132,7 +197,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
defer rows.Close()
|
||||
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
}
|
||||
var sysRows []sysRow
|
||||
@@ -146,20 +211,18 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Reverse to chronological order
|
||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||
}
|
||||
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
|
||||
// Load GPU rows in range
|
||||
type gpuKey struct{ ts int64; idx int }
|
||||
type gpuKey struct {
|
||||
ts int64
|
||||
idx int
|
||||
}
|
||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||
gRows, err := m.db.Query(
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
minTS, maxTS,
|
||||
)
|
||||
if err == nil {
|
||||
@@ -167,14 +230,17 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
for gRows.Next() {
|
||||
var ts int64
|
||||
var g platform.GPUMetricRow
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Load fan rows in range
|
||||
type fanKey struct{ ts int64; name string }
|
||||
type fanKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
fanData := map[fanKey]float64{}
|
||||
fRows, err := m.db.Query(
|
||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -192,7 +258,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
|
||||
// Load temp rows in range
|
||||
type tempKey struct{ ts int64; name string }
|
||||
type tempKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
tempData := map[tempKey]platform.TempReading{}
|
||||
tRows, err := m.db.Query(
|
||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -208,7 +277,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
@@ -217,6 +288,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
@@ -225,6 +298,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
@@ -233,6 +308,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
@@ -266,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||
g.clock_mhz, g.mem_clock_mhz
|
||||
FROM sys_metrics s
|
||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||
ORDER BY s.ts, g.gpu_index
|
||||
@@ -277,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
defer rows.Close()
|
||||
|
||||
cw := csv.NewWriter(w)
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||
for rows.Next() {
|
||||
var ts int64
|
||||
var cpu, mem, pwr float64
|
||||
var gpuIdx sql.NullInt64
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
||||
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||
continue
|
||||
}
|
||||
row := []string{
|
||||
@@ -299,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||
)
|
||||
} else {
|
||||
row = append(row, "", "", "", "", "")
|
||||
row = append(row, "", "", "", "", "", "", "")
|
||||
}
|
||||
_ = cw.Write(row)
|
||||
}
|
||||
@@ -309,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
return cw.Error()
|
||||
}
|
||||
|
||||
// Close closes the database.
|
||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||
|
||||
func nullFloat(v float64) sql.NullFloat64 {
|
||||
return sql.NullFloat64{Float64: v, Valid: true}
|
||||
}
|
||||
|
||||
174
audit/internal/webui/metricsdb_test.go
Normal file
174
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
_ "modernc.org/sqlite"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||
raw, err := sql.Open("sqlite", path)
|
||||
if err != nil {
|
||||
t.Fatalf("sql.Open: %v", err)
|
||||
}
|
||||
_, err = raw.Exec(`
|
||||
CREATE TABLE gpu_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
gpu_index INTEGER NOT NULL,
|
||||
temp_c REAL,
|
||||
usage_pct REAL,
|
||||
mem_usage_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts, gpu_index)
|
||||
);
|
||||
CREATE TABLE sys_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
cpu_load_pct REAL,
|
||||
mem_load_pct REAL,
|
||||
power_w REAL,
|
||||
PRIMARY KEY (ts)
|
||||
);
|
||||
CREATE TABLE fan_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
rpm REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
CREATE TABLE temp_metrics (
|
||||
ts INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
grp TEXT NOT NULL,
|
||||
celsius REAL,
|
||||
PRIMARY KEY (ts, name)
|
||||
);
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatalf("create legacy schema: %v", err)
|
||||
}
|
||||
_ = raw.Close()
|
||||
|
||||
db, err := openMetricsDB(path)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
now := time.Unix(1_700_000_100, 0).UTC()
|
||||
err = db.Write(platform.LiveMetricSample{
|
||||
Timestamp: now,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
|
||||
samples, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||
t.Fatalf("samples=%+v", samples)
|
||||
}
|
||||
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||
}
|
||||
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 5; i++ {
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||
CPULoadPct: float64(i),
|
||||
}); err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("LoadBetween: %v", err)
|
||||
}
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||
}
|
||||
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var taskSerialWriteLine = writeTaskSerialLine
|
||||
|
||||
func writeTaskSerialLine(line string) {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
return
|
||||
}
|
||||
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
|
||||
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
|
||||
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
_, _ = f.WriteString(payload)
|
||||
_ = f.Close()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func taskSerialPrefix(t *Task) string {
|
||||
if t == nil {
|
||||
return "[task] "
|
||||
}
|
||||
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||
}
|
||||
|
||||
func taskSerialEvent(t *Task, event string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,7 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -34,6 +35,49 @@ func TestChartLegendNumber(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
panic("boom")
|
||||
}))
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusInternalServerError {
|
||||
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||
t.Fatalf("body=%q", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
if !sseWrite(w, "tick", "ok") {
|
||||
t.Fatal("expected sse write to succeed")
|
||||
}
|
||||
}))
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
|
||||
|
||||
handler.ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
|
||||
t.Fatalf("content-type=%q", got)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
@@ -89,6 +133,331 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 7, PowerW: 170},
|
||||
{GPUIndex: 2, PowerW: 120},
|
||||
{GPUIndex: 0, PowerW: 100},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: 101},
|
||||
{GPUIndex: 7, PowerW: 171},
|
||||
{GPUIndex: 2, PowerW: 121},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||
if len(names) != len(wantNames) {
|
||||
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||
}
|
||||
for i := range wantNames {
|
||||
if names[i] != wantNames[i] {
|
||||
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||
}
|
||||
}
|
||||
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||
}
|
||||
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||
}
|
||||
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1400},
|
||||
{GPUIndex: 3, ClockMHz: 1500},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1410},
|
||||
{GPUIndex: 3, ClockMHz: 1510},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||
if !ok {
|
||||
t.Fatal("gpu-all-clock returned ok=false")
|
||||
}
|
||||
if title != "GPU Core Clock" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if got := datasets[1][1]; got != 1510 {
|
||||
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||
want := []float64{0, 480, 480, 480, 510, 510}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||
body := renderMetrics()
|
||||
if !strings.Contains(body, "const probe = new Image();") {
|
||||
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
|
||||
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
|
||||
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartLegendVisible(t *testing.T) {
|
||||
if !chartLegendVisible(8) {
|
||||
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||
}
|
||||
if chartLegendVisible(9) {
|
||||
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 999, want: "999"},
|
||||
{in: 1000, want: "1к"},
|
||||
{in: 1370, want: "1,4к"},
|
||||
{in: 1500, want: "1,5к"},
|
||||
{in: 1700, want: "1,7к"},
|
||||
{in: 2000, want: "2к"},
|
||||
{in: 9999, want: "10к"},
|
||||
{in: 10200, want: "10к"},
|
||||
{in: -1500, want: "-1,5к"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartCanvasHeight(t *testing.T) {
|
||||
if got := chartCanvasHeight(4); got != 360 {
|
||||
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||
}
|
||||
if got := chartCanvasHeight(12); got != 288 {
|
||||
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
|
||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||
end := start.Add(10 * time.Minute)
|
||||
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
|
||||
s := start.Add(offsetStart)
|
||||
e := start.Add(offsetEnd)
|
||||
return Task{
|
||||
Name: "task",
|
||||
Status: TaskDone,
|
||||
StartedAt: &s,
|
||||
DoneAt: &e,
|
||||
}
|
||||
}
|
||||
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
|
||||
taskWindow(1*time.Minute, 3*time.Minute),
|
||||
taskWindow(2*time.Minute, 5*time.Minute),
|
||||
taskWindow(7*time.Minute, 8*time.Minute),
|
||||
})
|
||||
if len(segments) != 5 {
|
||||
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
|
||||
}
|
||||
wantActive := []bool{false, true, false, true, false}
|
||||
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
|
||||
for i, segment := range segments {
|
||||
if segment.Active != wantActive[i] {
|
||||
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
|
||||
}
|
||||
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
|
||||
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
|
||||
}
|
||||
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
|
||||
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
|
||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||
labels := []string{"12:00", "12:01", "12:02"}
|
||||
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
|
||||
svg, err := renderMetricChartSVG(
|
||||
"System Power",
|
||||
labels,
|
||||
times,
|
||||
[][]float64{{300, 320, 310}},
|
||||
[]string{"Power W"},
|
||||
floatPtr(0),
|
||||
floatPtr(400),
|
||||
360,
|
||||
[]chartTimelineSegment{
|
||||
{Start: start, End: start.Add(time.Minute), Active: false},
|
||||
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
body := string(svg)
|
||||
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||
t.Fatalf("svg missing timeline overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `opacity="0.10"`) {
|
||||
t.Fatalf("svg missing idle overlay opacity: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `System Power`) {
|
||||
t.Fatalf("svg missing chart title: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = db.db.Close() })
|
||||
|
||||
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||
for i, sample := range []platform.LiveMetricSample{
|
||||
{Timestamp: start, PowerW: 300},
|
||||
{Timestamp: start.Add(time.Minute), PowerW: 320},
|
||||
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
|
||||
} {
|
||||
if err := db.Write(sample); err != nil {
|
||||
t.Fatalf("write sample %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
prevTasks := globalQueue.tasks
|
||||
s := start.Add(30 * time.Second)
|
||||
e := start.Add(90 * time.Second)
|
||||
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = prevTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
|
||||
h.handleMetricsChartSVG(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||
t.Fatalf("custom svg response missing timeline overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `stroke-linecap="round"`) {
|
||||
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||
r1 := newMetricsRing(4)
|
||||
r2 := newMetricsRing(4)
|
||||
r1.push(1000)
|
||||
r1.push(1100)
|
||||
r2.push(1200)
|
||||
r2.push(1300)
|
||||
|
||||
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||
if len(datasets) != 2 {
|
||||
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||
}
|
||||
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != 2 {
|
||||
t.Fatalf("labels=%v want 2 entries", labels)
|
||||
}
|
||||
if labels[0] == "" || labels[1] == "" {
|
||||
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||
body := renderNetworkInline()
|
||||
if !strings.Contains(body, "d.pending_change") {
|
||||
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -101,9 +470,10 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
Title: "Bee Hardware Audit",
|
||||
BuildLabel: "1.2.3",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
first := httptest.NewRecorder()
|
||||
@@ -118,6 +488,11 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||
}
|
||||
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||
}
|
||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||
t.Fatalf("first cache-control=%q", got)
|
||||
}
|
||||
@@ -155,7 +530,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
@@ -163,6 +538,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if strings.TrimSpace(rec.Body.String()) != "ready" {
|
||||
t.Fatalf("body=%q want ready", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -185,6 +572,336 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`href="/benchmark"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/benchmark/nvidia/run`,
|
||||
`benchmark-run-nccl`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
{
|
||||
Index: 1,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1168.50,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`GPU 0`,
|
||||
`GPU 1`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
`1168.50`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA GPU Targeted Stress`,
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||
`Select All`,
|
||||
`id="sat-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Max Compute Load`,
|
||||
`dcgmproftester`,
|
||||
`NCCL`,
|
||||
`Validate → Stress mode`,
|
||||
`id="burn-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
reportPath := filepath.Join(reportDir, "report.html")
|
||||
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
ArtifactsDir: reportDir,
|
||||
ReportHTMLPath: reportPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `saved report`) {
|
||||
t.Fatalf("task detail page missing saved report: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Back to Tasks`) {
|
||||
t.Fatalf("task detail page missing back link: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-live-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Cancel</button>`) {
|
||||
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
db, err := openMetricsDB(metricsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
base := time.Now().UTC()
|
||||
samples := []platform.LiveMetricSample{
|
||||
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||
}
|
||||
for _, sample := range samples {
|
||||
if err := db.Write(sample); err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
}
|
||||
_ = db.Close()
|
||||
|
||||
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-chart-1",
|
||||
Name: "Power Window",
|
||||
Target: "cpu",
|
||||
Status: TaskDone,
|
||||
CreatedAt: started.Add(-10 * time.Second),
|
||||
StartedAt: &started,
|
||||
DoneAt: &done,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||
req.SetPathValue("id", "task-chart-1")
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "System Power") {
|
||||
t.Fatalf("task chart missing expected title: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "min 200") {
|
||||
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||
}
|
||||
if strings.Contains(body, "min 100") {
|
||||
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -259,6 +976,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := archive.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -295,3 +1023,101 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
health := `{
|
||||
"status":"PARTIAL",
|
||||
"checked_at":"2026-03-16T10:00:00Z",
|
||||
"export_dir":"/tmp/export",
|
||||
"driver_ready":true,
|
||||
"cuda_ready":false,
|
||||
"network_status":"PARTIAL",
|
||||
"issues":[
|
||||
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
|
||||
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
|
||||
],
|
||||
"tools":[
|
||||
{"name":"dmidecode","ok":true},
|
||||
{"name":"nvidia-smi","ok":false}
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
componentStatus := `[
|
||||
{
|
||||
"component_key":"cpu:all",
|
||||
"status":"Warning",
|
||||
"error_summary":"cpu SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
|
||||
},
|
||||
{
|
||||
"component_key":"memory:all",
|
||||
"status":"OK",
|
||||
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
|
||||
},
|
||||
{
|
||||
"component_key":"storage:nvme0n1",
|
||||
"status":"Critical",
|
||||
"error_summary":"storage SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
|
||||
},
|
||||
{
|
||||
"component_key":"pcie:gpu:nvidia",
|
||||
"status":"Warning",
|
||||
"error_summary":"nvidia SAT: FAILED",
|
||||
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
|
||||
}
|
||||
]`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
// Runtime Health card — LiveCD checks only
|
||||
`Runtime Health`,
|
||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||
`Export Directory`,
|
||||
`Network`,
|
||||
`NVIDIA/AMD Driver`,
|
||||
`CUDA / ROCm`,
|
||||
`Required Utilities`,
|
||||
`Bee Services`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
`>Memory<`,
|
||||
`>Storage<`,
|
||||
`>GPU<`,
|
||||
`>PSU<`,
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
42
audit/internal/webui/stability.go
Normal file
42
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"runtime/debug"
|
||||
"time"
|
||||
)
|
||||
|
||||
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||
go func() {
|
||||
for {
|
||||
if !runRecoverable(name, fn) {
|
||||
return
|
||||
}
|
||||
if restartDelay > 0 {
|
||||
time.Sleep(restartDelay)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func goRecoverOnce(name string, fn func()) {
|
||||
go func() {
|
||||
_ = runRecoverable(name, fn)
|
||||
}()
|
||||
}
|
||||
|
||||
func runRecoverable(name string, fn func()) (panicked bool) {
|
||||
defer func() {
|
||||
if rec := recover(); rec != nil {
|
||||
panicked = true
|
||||
slog.Error("recovered panic",
|
||||
"component", name,
|
||||
"panic", fmt.Sprint(rec),
|
||||
"stack", string(debug.Stack()),
|
||||
)
|
||||
}
|
||||
}()
|
||||
fn()
|
||||
return false
|
||||
}
|
||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
task, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
snapshot := *task
|
||||
body := renderTaskDetailPage(h.opts, snapshot)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(body))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
type taskChartIndexEntry struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
entries := make([]taskChartIndexEntry, 0)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
_ = json.NewEncoder(w).Encode(entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||
path, ok := taskChartPathFromFile(file)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||
title := task.Name
|
||||
if strings.TrimSpace(title) == "" {
|
||||
title = task.ID
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||
}
|
||||
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||
body.WriteString(`</div>`)
|
||||
|
||||
if report := loadTaskReportFragment(task); report != "" {
|
||||
body.WriteString(report)
|
||||
} else {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||
}
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
body.WriteString(`<script>
|
||||
function cancelTaskDetail(id) {
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||
var term = document.getElementById('task-live-log');
|
||||
if (term) {
|
||||
term.textContent += '\nCancel requested.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
});
|
||||
}
|
||||
function renderTaskLiveCharts(taskId, charts) {
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
const seen = {};
|
||||
charts.forEach(function(chart) {
|
||||
seen[chart.file] = true;
|
||||
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||
if (img) {
|
||||
const card = img.closest('.card');
|
||||
if (card) {
|
||||
const title = card.querySelector('.card-head');
|
||||
if (title) title.textContent = chart.title;
|
||||
}
|
||||
return;
|
||||
}
|
||||
const card = document.createElement('div');
|
||||
card.className = 'card';
|
||||
card.style.margin = '0';
|
||||
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||
card.querySelector('.card-head').textContent = chart.title;
|
||||
const body = card.querySelector('.card-body');
|
||||
img = document.createElement('img');
|
||||
img.setAttribute('data-task-chart', '1');
|
||||
img.setAttribute('data-chart-file', chart.file);
|
||||
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||
img.style.width = '100%';
|
||||
img.style.display = 'block';
|
||||
img.style.borderRadius = '6px';
|
||||
img.alt = chart.title;
|
||||
body.appendChild(img);
|
||||
host.appendChild(card);
|
||||
});
|
||||
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||
const file = img.getAttribute('data-chart-file') || '';
|
||||
if (seen[file]) return;
|
||||
const card = img.closest('.card');
|
||||
if (card) card.remove();
|
||||
});
|
||||
}
|
||||
function loadTaskLiveCharts(taskId) {
|
||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||
renderTaskLiveCharts(taskId, charts);
|
||||
}).catch(function(){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||
});
|
||||
}
|
||||
function refreshTaskLiveCharts() {
|
||||
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||
const base = img.dataset.baseSrc;
|
||||
if (!base) return;
|
||||
img.src = base + '?t=' + Date.now();
|
||||
});
|
||||
}
|
||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||
var _taskChartTimer = null;
|
||||
var _taskChartsFrozen = false;
|
||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||
_taskDetailES.addEventListener('done', function(e){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
_taskChartsFrozen = true;
|
||||
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||
refreshTaskLiveCharts();
|
||||
});
|
||||
_taskDetailES.onerror = function(){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
if (_taskDetailES) {
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
}
|
||||
};
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
_taskChartTimer = setInterval(function(){
|
||||
if (_taskChartsFrozen) return;
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
refreshTaskLiveCharts();
|
||||
}, 2000);
|
||||
</script>`)
|
||||
}
|
||||
|
||||
return layoutHead(opts.Title+" — "+title) +
|
||||
layoutNav("tasks", opts.BuildLabel) +
|
||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||
body.String() +
|
||||
`</div></div></body></html>`
|
||||
}
|
||||
|
||||
func loadTaskReportFragment(task Task) string {
|
||||
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||
return ""
|
||||
}
|
||||
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||
if strings.TrimSpace(absPath) == "" {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||
}
|
||||
|
||||
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||
id := r.PathValue("id")
|
||||
taskPtr, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||
}
|
||||
task := *taskPtr
|
||||
start, end := taskTimeWindow(&task)
|
||||
samples, err := loadTaskMetricSamples(start, end)
|
||||
if err != nil {
|
||||
return task, nil, start, end, true
|
||||
}
|
||||
return task, samples, start, end, true
|
||||
}
|
||||
|
||||
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||
start, end := taskTimeWindow(&task)
|
||||
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
}
|
||||
|
||||
func taskChartPathFromFile(file string) (string, bool) {
|
||||
file = strings.TrimSpace(file)
|
||||
for _, spec := range taskDashboardChartSpecs {
|
||||
if spec.File == file {
|
||||
return spec.Path, true
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||
return "gpu/" + id + "-overview", true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
343
audit/internal/webui/task_report.go
Normal file
343
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,343 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
var taskReportMetricsDBPath = metricsDBPath
|
||||
|
||||
type taskReport struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
DurationSec int `json:"duration_sec,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LogFile string `json:"log_file,omitempty"`
|
||||
Charts []taskReportChart `json:"charts,omitempty"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
}
|
||||
|
||||
type taskReportChart struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
|
||||
type taskChartSpec struct {
|
||||
Path string
|
||||
File string
|
||||
}
|
||||
|
||||
var taskDashboardChartSpecs = []taskChartSpec{
|
||||
{Path: "server-load", File: "server-load.svg"},
|
||||
{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
|
||||
{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
|
||||
{Path: "server-power", File: "server-power.svg"},
|
||||
{Path: "server-fans", File: "server-fans.svg"},
|
||||
{Path: "gpu-all-load", File: "gpu-all-load.svg"},
|
||||
{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
|
||||
{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
|
||||
{Path: "gpu-all-power", File: "gpu-all-power.svg"},
|
||||
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
||||
}
|
||||
|
||||
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||
specs = append(specs, taskDashboardChartSpecs...)
|
||||
for _, idx := range taskGPUIndices(samples) {
|
||||
specs = append(specs, taskChartSpec{
|
||||
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||
})
|
||||
}
|
||||
return specs
|
||||
}
|
||||
|
||||
func writeTaskReportArtifacts(t *Task) error {
|
||||
if t == nil {
|
||||
return nil
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start, end := taskTimeWindow(t)
|
||||
samples, _ := loadTaskMetricSamples(start, end)
|
||||
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||
|
||||
logText := ""
|
||||
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||
logText = string(data)
|
||||
}
|
||||
|
||||
report := taskReport{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||
Error: t.ErrMsg,
|
||||
LogFile: filepath.Base(t.LogPath),
|
||||
Charts: charts,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
}
|
||||
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||
}
|
||||
|
||||
func reportDoneTime(t *Task) time.Time {
|
||||
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
return *t.DoneAt
|
||||
}
|
||||
return time.Now()
|
||||
}
|
||||
|
||||
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||
if t == nil {
|
||||
now := time.Now().UTC()
|
||||
return now, now
|
||||
}
|
||||
start := t.CreatedAt.UTC()
|
||||
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||
start = t.StartedAt.UTC()
|
||||
}
|
||||
end := time.Now().UTC()
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = t.DoneAt.UTC()
|
||||
}
|
||||
if end.Before(start) {
|
||||
end = start
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer db.Close()
|
||||
return db.LoadBetween(start, end)
|
||||
}
|
||||
|
||||
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||
if len(samples) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
var charts []taskReportChart
|
||||
inline := make(map[string]string)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||
if !ok || len(svg) == 0 {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(dir, spec.File)
|
||||
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||
inline[spec.File] = string(svg)
|
||||
}
|
||||
return charts, inline
|
||||
}
|
||||
|
||||
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil || !hasData {
|
||||
return "", nil, false
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
return title, buf, true
|
||||
}
|
||||
|
||||
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]bool{}
|
||||
var out []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if seen[g.GPUIndex] {
|
||||
continue
|
||||
}
|
||||
seen[g.GPUIndex] = true
|
||||
out = append(out, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func writeJSONFile(path string, v any) error {
|
||||
data, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="grid2">`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||
if strings.TrimSpace(report.Error) != "" {
|
||||
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||
}
|
||||
b.WriteString(`</div></div>`)
|
||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||
b.WriteString(`</div></div></div>`)
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||
b.WriteString(charts[chart.File])
|
||||
b.WriteString(`</div></div>`)
|
||||
}
|
||||
} else {
|
||||
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||
if len(runs) == 0 {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
func taskArchivePathFromLog(logText string) string {
|
||||
lines := strings.Split(logText, "\n")
|
||||
for i := len(lines) - 1; i >= 0; i-- {
|
||||
line := strings.TrimSpace(lines[i])
|
||||
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||
if strings.HasPrefix(path, "Archive written to ") {
|
||||
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||
}
|
||||
if strings.HasSuffix(path, ".tar.gz") {
|
||||
return path
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func renderTaskStatusBadge(status string) string {
|
||||
className := map[string]string{
|
||||
TaskRunning: "badge-ok",
|
||||
TaskPending: "badge-unknown",
|
||||
TaskDone: "badge-ok",
|
||||
TaskFailed: "badge-err",
|
||||
TaskCancelled: "badge-unknown",
|
||||
}[status]
|
||||
if className == "" {
|
||||
className = "badge-unknown"
|
||||
}
|
||||
label := strings.TrimSpace(status)
|
||||
if label == "" {
|
||||
label = "unknown"
|
||||
}
|
||||
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||
}
|
||||
|
||||
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||
if ts != nil && !ts.IsZero() {
|
||||
return ts.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
if !fallback.IsZero() {
|
||||
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
return "n/a"
|
||||
}
|
||||
|
||||
func formatTaskDuration(sec int) string {
|
||||
if sec <= 0 {
|
||||
return "n/a"
|
||||
}
|
||||
if sec < 60 {
|
||||
return fmt.Sprintf("%ds", sec)
|
||||
}
|
||||
if sec < 3600 {
|
||||
return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,12 +2,18 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
@@ -22,21 +28,34 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
|
||||
// A task that was pending (not yet started) must be re-queued on restart.
|
||||
pendingTask := &Task{
|
||||
ID: "task-pending",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskRunning,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{
|
||||
Duration: 300,
|
||||
BurnProfile: "smoke",
|
||||
},
|
||||
params: taskParams{Duration: 300, BurnProfile: "smoke"},
|
||||
}
|
||||
// A task that was running when bee-web crashed must NOT be re-queued —
|
||||
// its child processes (e.g. gpu-burn-worker) survive the restart in
|
||||
// their own process groups and can't be cancelled retroactively.
|
||||
runningTask := &Task{
|
||||
ID: "task-running",
|
||||
Name: "NVIDIA GPU Stress",
|
||||
Target: "nvidia-stress",
|
||||
Priority: 1,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-3 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{Duration: 86400},
|
||||
}
|
||||
for _, task := range []*Task{pendingTask, runningTask} {
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
}
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
@@ -46,18 +65,47 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||
if len(recovered.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(recovered.tasks))
|
||||
}
|
||||
got := recovered.tasks[0]
|
||||
if got.Status != TaskPending {
|
||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||
|
||||
byID := map[string]*Task{}
|
||||
for i := range recovered.tasks {
|
||||
byID[recovered.tasks[i].ID] = recovered.tasks[i]
|
||||
}
|
||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("params=%+v", got.params)
|
||||
|
||||
// Pending task must be re-queued as pending with params intact.
|
||||
p := byID["task-pending"]
|
||||
if p == nil {
|
||||
t.Fatal("task-pending not found")
|
||||
}
|
||||
if got.LogPath == "" {
|
||||
t.Fatal("expected log path")
|
||||
if p.Status != TaskPending {
|
||||
t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
|
||||
}
|
||||
if p.StartedAt != nil {
|
||||
t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
|
||||
}
|
||||
if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("pending task: params=%+v", p.params)
|
||||
}
|
||||
if p.LogPath == "" {
|
||||
t.Fatal("pending task: expected log path")
|
||||
}
|
||||
|
||||
// Running task must be marked failed, not re-queued, to prevent
|
||||
// launching duplicate workers (e.g. a second set of gpu-burn-workers).
|
||||
r := byID["task-running"]
|
||||
if r == nil {
|
||||
t.Fatal("task-running not found")
|
||||
}
|
||||
if r.Status != TaskFailed {
|
||||
t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
|
||||
}
|
||||
if r.ErrMsg == "" {
|
||||
t.Fatal("running task: expected non-empty error message")
|
||||
}
|
||||
if r.DoneAt == nil {
|
||||
t.Fatal("running task: expected done_at to be set")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,15 +126,363 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
tasks: []*Task{
|
||||
{
|
||||
ID: "old-running",
|
||||
Name: "Old Running",
|
||||
Status: TaskRunning,
|
||||
Priority: 10,
|
||||
CreatedAt: now.Add(-3 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "new-done",
|
||||
Name: "New Done",
|
||||
Status: TaskDone,
|
||||
Priority: 0,
|
||||
CreatedAt: now.Add(-1 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "mid-pending",
|
||||
Name: "Mid Pending",
|
||||
Status: TaskPending,
|
||||
Priority: 1,
|
||||
CreatedAt: now.Add(-2 * time.Minute),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := q.snapshot()
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||
}
|
||||
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
origCounter := jobCounter.Load()
|
||||
jobCounter.Store(0)
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
jobCounter.Store(origCounter)
|
||||
})
|
||||
|
||||
if got := newJobID("ignored"); got != "TASK-000" {
|
||||
t.Fatalf("id=%q want TASK-000", got)
|
||||
}
|
||||
if got := newJobID("ignored"); got != "TASK-001" {
|
||||
t.Fatalf("id=%q want TASK-001", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
task := &Task{
|
||||
ID: "TASK-007",
|
||||
Name: "NVIDIA Benchmark",
|
||||
}
|
||||
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||
if !strings.HasPrefix(got, "007_") {
|
||||
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
db, err := openMetricsDB(metricsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
base := time.Now().UTC().Add(-45 * time.Second)
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base,
|
||||
CPULoadPct: 42,
|
||||
MemLoadPct: 35,
|
||||
PowerW: 510,
|
||||
}); err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
_ = db.Close()
|
||||
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
started := time.Now().UTC().Add(-90 * time.Second)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: started.Add(-10 * time.Second),
|
||||
StartedAt: &started,
|
||||
}
|
||||
q.assignTaskLogPathLocked(task)
|
||||
appendJobLog(task.LogPath, "line-1")
|
||||
|
||||
job := newTaskJobState(task.LogPath)
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
if task.Status != TaskDone {
|
||||
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||
}
|
||||
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||
t.Fatalf("report json: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||
t.Fatalf("report html: %v", err)
|
||||
}
|
||||
var report taskReport
|
||||
data, err := os.ReadFile(task.ReportJSONPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.json): %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(data, &report); err != nil {
|
||||
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||
}
|
||||
if report.ID != task.ID || report.Status != TaskDone {
|
||||
t.Fatalf("report=%+v", report)
|
||||
}
|
||||
if len(report.Charts) == 0 {
|
||||
t.Fatalf("expected charts in report, got none")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Benchmark",
|
||||
Target: "nvidia-benchmark",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||
}
|
||||
|
||||
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.html): %v", err)
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`GPU 0`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
t.Fatalf("report missing %q: %s", needle, html)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||
var lines []string
|
||||
prev := taskSerialWriteLine
|
||||
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-serial-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
|
||||
q.enqueue(task)
|
||||
started := time.Now().UTC()
|
||||
task.Status = TaskRunning
|
||||
task.StartedAt = &started
|
||||
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
job.append("Starting CPU SAT...")
|
||||
job.append("CPU stress duration: 60s")
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
joined := strings.Join(lines, "\n")
|
||||
for _, needle := range []string{
|
||||
"queued",
|
||||
"Starting CPU SAT...",
|
||||
"CPU stress duration: 60s",
|
||||
"finished with status=done",
|
||||
} {
|
||||
if !strings.Contains(joined, needle) {
|
||||
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
want burnPreset
|
||||
}{
|
||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
||||
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||
@@ -95,6 +491,83 @@ func TestResolveBurnPreset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveNvidiaRampPlan(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
profile string
|
||||
enabled bool
|
||||
selected []int
|
||||
want nvidiaRampSpec
|
||||
wantErr string
|
||||
}{
|
||||
{
|
||||
name: "disabled uses base preset",
|
||||
profile: "acceptance",
|
||||
selected: []int{0, 1},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, TotalDurationSec: 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "smoke ramp uses two minute steps",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 5 * 60, StaggerSeconds: 2 * 60, TotalDurationSec: 9 * 60},
|
||||
},
|
||||
{
|
||||
name: "acceptance ramp uses ten minute steps",
|
||||
profile: "acceptance",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 10 * 60, TotalDurationSec: 80 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight stays at eight hours when possible",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2},
|
||||
want: nvidiaRampSpec{DurationSec: 6 * 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 8 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight extends to keep one hour after final gpu",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8},
|
||||
want: nvidiaRampSpec{DurationSec: 60 * 60, StaggerSeconds: 60 * 60, TotalDurationSec: 9 * 60 * 60},
|
||||
},
|
||||
{
|
||||
name: "overnight rejects impossible gpu count",
|
||||
profile: "overnight",
|
||||
enabled: true,
|
||||
selected: []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
|
||||
wantErr: "at most 10 GPUs",
|
||||
},
|
||||
{
|
||||
name: "enabled requires explicit selection",
|
||||
profile: "smoke",
|
||||
enabled: true,
|
||||
wantErr: "requires explicit GPU selection",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got, err := resolveNvidiaRampPlan(tc.profile, tc.enabled, tc.selected)
|
||||
if tc.wantErr != "" {
|
||||
if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
|
||||
t.Fatalf("err=%v want substring %q", err, tc.wantErr)
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("resolveNvidiaRampPlan error: %v", err)
|
||||
}
|
||||
if got != tc.want {
|
||||
t.Fatalf("resolveNvidiaRampPlan(%q, %t, %v)=%+v want %+v", tc.profile, tc.enabled, tc.selected, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
@@ -113,8 +586,6 @@ func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
blocked := make(chan struct{})
|
||||
released := make(chan struct{})
|
||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
@@ -173,8 +644,6 @@ func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
@@ -202,3 +671,151 @@ func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "support-bundle-1",
|
||||
Name: "Support Bundle",
|
||||
Target: "support-bundle",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotExportDir string
|
||||
orig := buildSupportBundle
|
||||
buildSupportBundle = func(exportDir string) (string, error) {
|
||||
gotExportDir = exportDir
|
||||
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||
}
|
||||
defer func() { buildSupportBundle = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotExportDir != dir {
|
||||
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||
t.Fatalf("lines=%v", j.lines)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||
started := time.Time{}
|
||||
task := &Task{
|
||||
Status: TaskRunning,
|
||||
CreatedAt: created,
|
||||
StartedAt: &started,
|
||||
}
|
||||
if got := taskElapsedSec(task, now); got != 0 {
|
||||
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||
}
|
||||
|
||||
stale := created.Add(-24 * time.Hour)
|
||||
task.StartedAt = &stale
|
||||
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "install-1",
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Device: "/dev/sda"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotDevice string
|
||||
var gotLogPath string
|
||||
orig := installCommand
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
gotDevice = device
|
||||
gotLogPath = logPath
|
||||
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||
}
|
||||
defer func() { installCommand = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDevice != "/dev/sda" {
|
||||
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||
}
|
||||
if gotLogPath == "" {
|
||||
t.Fatal("expected install log path")
|
||||
}
|
||||
logs := strings.Join(j.lines, "\n")
|
||||
if !strings.Contains(logs, "Install log: ") {
|
||||
t.Fatalf("missing install log line: %v", j.lines)
|
||||
}
|
||||
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||
t.Fatalf("missing streamed output: %v", j.lines)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
kmsgWatcher: newKmsgWatcher(nil),
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-panic-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
panic("boom")
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.executeTask(tk, j, context.Background())
|
||||
|
||||
if tk.Status != TaskFailed {
|
||||
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||
}
|
||||
if tk.DoneAt == nil {
|
||||
t.Fatal("expected done_at to be set")
|
||||
}
|
||||
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||
}
|
||||
if !strings.Contains(j.err, "task panic: boom") {
|
||||
t.Fatalf("job error=%q", j.err)
|
||||
}
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount := q.kmsgWatcher.activeCount
|
||||
window := q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||
}
|
||||
if window != nil {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
tag="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
|
||||
case "${tag}" in
|
||||
v*)
|
||||
printf '%s\n' "${tag#v}"
|
||||
;;
|
||||
"")
|
||||
printf 'dev\n'
|
||||
;;
|
||||
*)
|
||||
printf '%s\n' "${tag}"
|
||||
;;
|
||||
esac
|
||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
## Rule: live charts must be visually uniform
|
||||
|
||||
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||
changes to existing charts must keep the same rendering model and presentation
|
||||
rules unless there is an explicit architectural decision to diverge.
|
||||
|
||||
Default expectations:
|
||||
|
||||
- same server-side SVG pipeline for all live metrics charts
|
||||
- same refresh behaviour and failure handling in the browser
|
||||
- same canvas size class and card layout
|
||||
- same legend placement policy across charts
|
||||
- same axis, title, and summary conventions
|
||||
- no chart-specific visual exceptions added as a quick fix
|
||||
|
||||
Current default for live charts:
|
||||
|
||||
- legend below the plot area when a chart has 8 series or fewer
|
||||
- legend hidden when a chart has more than 8 series
|
||||
- 10 equal Y-axis steps across the chart height
|
||||
- 1400 x 360 SVG canvas with legend
|
||||
- 1400 x 288 SVG canvas without legend
|
||||
- full-width card rendering in a single-column stack
|
||||
|
||||
If one chart needs a different layout or legend behaviour, treat that as a
|
||||
design-level decision affecting the whole chart family, not as a local tweak to
|
||||
just one endpoint.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
||||
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||
the legend is hidden. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
@@ -60,6 +60,8 @@ Rules:
|
||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||
- SSH is independent from the desktop path
|
||||
- serial console support is enabled for VM boot debugging
|
||||
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||
|
||||
## ISO build sequence
|
||||
|
||||
|
||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||
The commit history shows several distinct attempts:
|
||||
|
||||
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||
|
||||
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||
|
||||
- `lb binary_memtest` does run and installs `memtest86+`
|
||||
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||
|
||||
So the assumption "live-build built-in memtest integration is enough on this stack" is currently false for this project until proven otherwise by a real built ISO.
|
||||
|
||||
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||
|
||||
- the build now completes successfully because memtest is non-blocking by default
|
||||
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||
- but it executes too early for its current target paths:
|
||||
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||
- memtest binaries are also still absent in `binary/boot/`
|
||||
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||
|
||||
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||
|
||||
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
artifact dated 2026-04-01:
|
||||
|
||||
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||
and `isolinux/live.cfg`
|
||||
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||
shipped ISO
|
||||
- the regression was in the build-time validator/debug path in `build.sh`
|
||||
|
||||
Root cause of the false alarm:
|
||||
|
||||
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||
successfully listed/extracted members"
|
||||
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||
observable output as "memtest content missing"
|
||||
- this made a reader failure look identical to a missing memtest payload
|
||||
- as a result, we re-entered the same memtest investigation loop even though
|
||||
the real ISO was already correct
|
||||
|
||||
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||
|
||||
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||
`iso_memtest_present` return code of `1` as fatal
|
||||
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||
the recovery design itself was wrong
|
||||
|
||||
## Known Failed Attempts
|
||||
|
||||
These approaches were already tried and should not be repeated blindly:
|
||||
|
||||
1. Built-in live-build memtest only.
|
||||
Reason it failed:
|
||||
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||
|
||||
2. Fixing only the memtest file names for Debian Bookworm.
|
||||
Reason it failed:
|
||||
- correct file names alone do not make the files appear in the final ISO.
|
||||
|
||||
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||
Reason it failed:
|
||||
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||
|
||||
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||
Reason it failed:
|
||||
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||
|
||||
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||
Reason it failed:
|
||||
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||
|
||||
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||
Reason it failed:
|
||||
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||
|
||||
## What This Means
|
||||
|
||||
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||
|
||||
- do not assume the built-in memtest stage is sufficient
|
||||
- do not assume `chroot/boot/` will contain memtest payloads
|
||||
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||
- do not assume the current normal binary hook timing is late enough for final patching
|
||||
|
||||
Any future memtest fix must explicitly identify:
|
||||
|
||||
- where the memtest binaries are reliably available at build time
|
||||
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||
the validator printed a memtest warning
|
||||
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||
context rather than accidentally tripping `set -e`
|
||||
|
||||
## Decision
|
||||
|
||||
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||
|
||||
Project rules from now on:
|
||||
|
||||
- Do **not** trust `--memtest memtest86+` by itself.
|
||||
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a GRUB menu entry
|
||||
- an isolinux menu entry
|
||||
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||
- a binary hook copying files into `binary/boot/`
|
||||
- extraction from the cached `memtest86+` `.deb`
|
||||
- another deterministic build-time copy step
|
||||
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||
|
||||
Current implementation direction:
|
||||
|
||||
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||
- do not rely on the current early `binary_hooks` timing for final patching
|
||||
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||
- patches the fully materialized `LB_DIR/binary` tree
|
||||
- injects memtest binaries there
|
||||
- ensures final bootloader entries there
|
||||
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||
- also treat ISO validation tooling as part of the critical path:
|
||||
- install a stable ISO reader in the builder image
|
||||
- fail with an explicit reader error if ISO listing/extraction fails
|
||||
- do not treat reader failure as evidence that memtest is missing
|
||||
- do not call a probe that may return "needs recovery" as a bare command under
|
||||
`set -e`; wrap it in explicit control flow
|
||||
|
||||
## Consequences
|
||||
|
||||
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-negative loop that burned 2026-04-01 v3.14–v3.19.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| Date | Decision | Status |
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
|
||||
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,248 @@
|
||||
# Benchmark clock calibration research
|
||||
|
||||
## Status
|
||||
In progress. Baseline data from production servers pending.
|
||||
|
||||
## Background
|
||||
|
||||
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||
`avg_steady_clock < locked_target * 0.90`.
|
||||
|
||||
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||
The 90% threshold has no empirical basis.
|
||||
|
||||
## Key observations (2026-04-06)
|
||||
|
||||
### H100 PCIe — new card, server not designed for it
|
||||
- avg clock 1384 MHz, P95 1560 MHz (unstable, proba boost 1755 MHz)
|
||||
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||
|
||||
### H200 NVL — new card, server not designed for it
|
||||
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||
- Degradation: power_capped, thermal_limited
|
||||
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||
|
||||
### Key insight
|
||||
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||
instability may reflect a more severe thermal mismatch or a card issue.
|
||||
|
||||
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||
|
||||
## Hypothesis for baseline
|
||||
|
||||
After testing on servers designed for their GPUs (proper cooling):
|
||||
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||
a healthy card under real workload should comfortably exceed base clock
|
||||
|
||||
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||
|
||||
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||
|
||||
### Healthy fingerprint
|
||||
|
||||
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||
- Avg steady (visual): **~1580–1620 MHz**
|
||||
- vs boost 1755 MHz: **~91–92%**
|
||||
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||
|
||||
### Characteristic patten
|
||||
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||
power cap algorithm. Do NOT flag this as instability.
|
||||
|
||||
### Clock CV implication
|
||||
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||
healthy HBM2e PCIe cards. Needs recalibration.
|
||||
|
||||
---
|
||||
|
||||
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||
|
||||
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||
|
||||
### GPU clock reference (from nvidia-smi, idle):
|
||||
- base_clock_mhz: **1095**
|
||||
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||
|
||||
### Observed under 700W sustained load (both samples nearly identical):
|
||||
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||
|
||||
### Diagnosis
|
||||
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||
refurbishment process.
|
||||
|
||||
### Clock pattern note
|
||||
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||
|
||||
---
|
||||
|
||||
## Baseline matrix (filled where data available)
|
||||
|
||||
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||
|---|---|---|---|---|---|
|
||||
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||
|
||||
---
|
||||
|
||||
## H100 official spec (from NVIDIA datasheet)
|
||||
|
||||
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||
|
||||
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||
|---|---|---|---|---|---|
|
||||
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||
|
||||
Notes:
|
||||
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||
|
||||
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||
|
||||
From the report in this session (power+thermal throttle throughout steady):
|
||||
|
||||
| Precision | Measured | Spec (dense) | % of spec |
|
||||
|---|---|---|---|
|
||||
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||
|
||||
33–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||
|
||||
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||
|
||||
Format: without sparsity / with sparsity.
|
||||
|
||||
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||
|---|---|---|---|---|---|
|
||||
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||
|
||||
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||
|
||||
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||
|
||||
| Precision | Measured | Spec (dense) | % of spec |
|
||||
|---|---|---|---|
|
||||
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||
|
||||
Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
|
||||
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||
|
||||
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||
|
||||
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||
|
||||
### What healthy systems actually achieve:
|
||||
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||
|
||||
### Our results vs expectation:
|
||||
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||
|---|---|---|---|---|
|
||||
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||
|
||||
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||
This is NOT normal — 30-44% is not the industry baseline.
|
||||
|
||||
### Likely causes of the gap (in order of probability):
|
||||
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||
|
||||
### Power limit gap analysis (H100 PCIe):
|
||||
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||
- Actually measured: 329 TOPS = 55% of that estimate
|
||||
- Remaining gap after accounting for clock throttle: ~45%
|
||||
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||
sustainable clock beyond what sw_thermal alone would cause.
|
||||
|
||||
### Action item:
|
||||
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||
so result.json shows if the card was pre-configured with a non-default limit.
|
||||
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||
|
||||
### CPU/RAM impact on GPU FLOPS:
|
||||
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||
CPU core count and host RAM are irrelevant.
|
||||
|
||||
## Compute efficiency metric (proposed, no hardcode)
|
||||
|
||||
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||
|
||||
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||
normal clocks.
|
||||
|
||||
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||
(needs to be added to queryBenchmarkGPUInfo).
|
||||
|
||||
Reference values to establish after baseline runs:
|
||||
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||
|
||||
## Proposed threshold changes (pending more data)
|
||||
|
||||
1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed
|
||||
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||
capture the root cause.
|
||||
|
||||
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||
under power cap. Consider suppressing this flag when power is flat and usage
|
||||
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||
|
||||
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||
would have been caught by this).
|
||||
|
||||
Decision deferred until baseline on SXM designed servers collected.
|
||||
117
bible-local/docs/gpu-model-propagation.md
Normal file
117
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# GPU Model Name Propagation
|
||||
|
||||
How GPU model names are detected, stored, and displayed throughout the project.
|
||||
|
||||
---
|
||||
|
||||
## Detection Sources
|
||||
|
||||
There are **two separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||
|
||||
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||
|
||||
**File:** `audit/internal/platform/sat.go`
|
||||
|
||||
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||
|
||||
### Pipeline B — Benchmark results
|
||||
|
||||
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||
|
||||
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||
- Used by: benchmark history table, benchmark report
|
||||
|
||||
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||
|
||||
**File:** `audit/internal/schema/hardware.go`
|
||||
|
||||
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||
|
||||
---
|
||||
|
||||
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||
|
||||
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||
|
||||
This means:
|
||||
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||
- AMD GPUs do have their model populated
|
||||
|
||||
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||
|
||||
---
|
||||
|
||||
## Benchmark History "Unknown GPU" Issue
|
||||
|
||||
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||
|
||||
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||
|
||||
This happens for:
|
||||
- Older result files saved before the `Name` field was added
|
||||
- Runs where nvidia-smi query failed before the benchmark started
|
||||
|
||||
---
|
||||
|
||||
## Fallback Strings — Current State
|
||||
|
||||
| Location | File | Fallback string |
|
||||
|---|---|---|
|
||||
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||
|
||||
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||
|
||||
---
|
||||
|
||||
## GPU Selection UI
|
||||
|
||||
**File:** `audit/internal/webui/pages.go`
|
||||
|
||||
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||
|
||||
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||
|
||||
---
|
||||
|
||||
## Data Flow Summary
|
||||
|
||||
```
|
||||
nvidia-smi (live)
|
||||
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||
├─ GPU selection UI (always correct)
|
||||
├─ Live metrics labels (charts_svg.go)
|
||||
└─ SAT/burn status file (sat.go)
|
||||
|
||||
nvidia-smi (at benchmark start)
|
||||
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||
├─ Benchmark report
|
||||
└─ Benchmark history table columns
|
||||
|
||||
nvidia-smi / lspci (audit collection)
|
||||
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||
└─ Hardware summary page hwDescribeGPU()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What Needs Fixing
|
||||
|
||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
|
||||
2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
|
||||
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
|
||||
@@ -17,6 +17,46 @@ This applies to:
|
||||
|
||||
## Memtest rule
|
||||
|
||||
Prefer live-build's built-in memtest integration over custom hooks or hardcoded
|
||||
bootloader paths. If you ever need to reference memtest files manually, verify
|
||||
the exact package file list first for the target Debian release.
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||
|
||||
For this project, memtest is accepted only when the produced ISO actually
|
||||
contains all of the following:
|
||||
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a memtest entry in `boot/grub/grub.cfg`
|
||||
- a memtest entry in `isolinux/live.cfg`
|
||||
|
||||
Rules:
|
||||
|
||||
- Keep explicit post-build memtest validation in `build.sh`.
|
||||
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||
If the reader cannot list or extract from the ISO, that is a validator
|
||||
failure, not proof that memtest is missing.
|
||||
- If built-in integration does not produce the artifacts above, use a
|
||||
deterministic project-owned copy/extract step instead of hoping live-build
|
||||
will "start working".
|
||||
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||
a real ISO.
|
||||
- If you reference memtest files manually, verify the exact package file list
|
||||
first for the target Debian release.
|
||||
|
||||
Known bad loops for this repository:
|
||||
|
||||
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||
the final artifact path.
|
||||
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||
bootloader configs.
|
||||
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||
present yet when the hook executed.
|
||||
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||
regression because the final ISO was correct but the validator produced a
|
||||
false negative.
|
||||
|
||||
@@ -17,6 +17,7 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
wget \
|
||||
curl \
|
||||
tar \
|
||||
libarchive-tools \
|
||||
xz-utils \
|
||||
rsync \
|
||||
build-essential \
|
||||
|
||||
@@ -6,9 +6,9 @@ NCCL_CUDA_VERSION=13.0
|
||||
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
|
||||
NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.2-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
|
||||
@@ -32,7 +32,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -33,10 +33,8 @@ typedef void *CUstream;
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
#define MAX_STRESS_STREAMS 16
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define STRESS_LAUNCH_DEPTH 8
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -344,7 +342,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
unsigned long iterations = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int launches_per_wave = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||
@@ -419,44 +416,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
|
||||
unsigned int threads = 256;
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
launches_per_wave = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
int launched = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launched++;
|
||||
iterations++;
|
||||
}
|
||||
if (launches_per_wave <= 0) {
|
||||
if (launched <= 0) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
}
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
iterations += (unsigned long)launches_per_wave;
|
||||
}
|
||||
api->cuCtxSynchronize();
|
||||
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||
goto fail;
|
||||
@@ -468,11 +463,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
report->iterations = iterations;
|
||||
snprintf(report->details,
|
||||
sizeof(report->details),
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
bytes_per_stream[0] / (1024u * 1024u),
|
||||
iterations);
|
||||
|
||||
@@ -606,6 +600,20 @@ struct prepared_profile {
|
||||
};
|
||||
|
||||
static const struct profile_desc k_profiles[] = {
|
||||
{
|
||||
"fp64",
|
||||
"fp64",
|
||||
80,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
8,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUDA_R_64F,
|
||||
CUBLAS_COMPUTE_64F,
|
||||
},
|
||||
{
|
||||
"fp32_tf32",
|
||||
"fp32",
|
||||
@@ -680,6 +688,8 @@ static const struct profile_desc k_profiles[] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
|
||||
|
||||
static int load_cublaslt(struct cublaslt_api *api) {
|
||||
memset(api, 0, sizeof(*api));
|
||||
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
|
||||
@@ -1112,9 +1122,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int cc_minor,
|
||||
int seconds,
|
||||
int size_mb,
|
||||
const char *precision_filter,
|
||||
struct stress_report *report) {
|
||||
struct cublaslt_api cublas;
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
|
||||
cublasLtHandle_t handle = NULL;
|
||||
CUcontext ctx = NULL;
|
||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||
@@ -1124,9 +1135,8 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int active = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int profile_count = PROFILE_COUNT;
|
||||
int prepared_count = 0;
|
||||
int wave_launches = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
@@ -1150,8 +1160,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count profiles matching the filter (for deciding what to run). */
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
|
||||
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
|
||||
planned++;
|
||||
}
|
||||
}
|
||||
@@ -1162,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Count all profiles active on this GPU regardless of filter.
|
||||
* Used as the budget divisor so matrix sizes stay consistent whether
|
||||
* running all precisions together or a single-precision phase. */
|
||||
int planned_total = 0;
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
|
||||
planned_total++;
|
||||
}
|
||||
}
|
||||
if (planned_total < planned) {
|
||||
planned_total = planned;
|
||||
}
|
||||
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
@@ -1186,18 +1211,17 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
mp_count,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
@@ -1211,6 +1235,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
desc->min_cc);
|
||||
continue;
|
||||
}
|
||||
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=SKIPPED precision_filter\n",
|
||||
desc->name);
|
||||
continue;
|
||||
}
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
CUstream stream = streams[lane];
|
||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||
@@ -1246,50 +1277,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Keep the GPU queue continuously full by submitting kernels without
|
||||
* synchronizing after every wave. A sync barrier after each small batch
|
||||
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||
* especially when individual kernels are short. Instead we sync at most
|
||||
* once per second (for error detection) and once at the very end. */
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
wave_launches = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched_this_batch++;
|
||||
int launched = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
launched++;
|
||||
}
|
||||
if (wave_launches <= 0) {
|
||||
if (launched <= 0) {
|
||||
break;
|
||||
}
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
}
|
||||
/* Final drain — ensure all queued work finishes before we read results. */
|
||||
cuda->cuCtxSynchronize();
|
||||
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
@@ -1327,6 +1363,7 @@ int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
@@ -1334,8 +1371,12 @@ int main(int argc, char **argv) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||
precision_filter = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
fprintf(stderr,
|
||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
|
||||
argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1395,10 +1436,20 @@ int main(int argc, char **argv) {
|
||||
int ok = 0;
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
|
||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||
#endif
|
||||
if (!ok) {
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
|
||||
if (precision_filter != NULL) {
|
||||
fprintf(stderr,
|
||||
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
|
||||
precision_filter,
|
||||
name,
|
||||
cc_major,
|
||||
cc_minor);
|
||||
return 1;
|
||||
}
|
||||
int ptx_mb = size_mb;
|
||||
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
@@ -180,6 +185,9 @@ case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
nvidia-legacy)
|
||||
run_variant nvidia-legacy
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
@@ -188,6 +196,7 @@ case "$VARIANT" in
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant nvidia-legacy
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
#!/bin/sh
|
||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
||||
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||
#
|
||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||
# - open -> kernel-open/ sources from the .run installer
|
||||
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||
#
|
||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||
@@ -17,10 +19,19 @@ set -e
|
||||
NVIDIA_VERSION="$1"
|
||||
DIST_DIR="$2"
|
||||
DEBIAN_KERNEL_ABI="$3"
|
||||
NVIDIA_FLAVOR="${4:-open}"
|
||||
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
|
||||
case "$NVIDIA_FLAVOR" in
|
||||
open|proprietary) ;;
|
||||
*)
|
||||
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
# On Debian, kernel headers are split into two packages:
|
||||
@@ -31,7 +42,22 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
CACHE_LAYOUT_VERSION="3"
|
||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||
echo "=== installing linux-headers-${KVER} ==="
|
||||
@@ -42,18 +68,6 @@ fi
|
||||
echo "kernel headers (arch): $KDIR_ARCH"
|
||||
echo "kernel headers (common): $KDIR_COMMON"
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l) .ko files"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Download official NVIDIA .run installer with sha256 verification
|
||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||
@@ -87,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
||||
rm -rf "$EXTRACT_DIR"
|
||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||
|
||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
||||
# Find kernel source directory for the selected flavor.
|
||||
KERNEL_SRC=""
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
else
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
fi
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
echo "kernel source: $KERNEL_SRC"
|
||||
|
||||
# Build kernel modules
|
||||
@@ -130,24 +150,30 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||
copied_libs=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||
cp "$f" "$CACHE_DIR/lib/"
|
||||
copied_libs=$((copied_libs+1))
|
||||
done
|
||||
|
||||
if [ "$copied_libs" -eq 0 ]; then
|
||||
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
done
|
||||
if [ "$count" -eq 0 ]; then
|
||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||
libnvidia-opencl; do
|
||||
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
@@ -156,23 +182,17 @@ done
|
||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl \
|
||||
libnvidia-compiler \
|
||||
libnvidia-nvvm \
|
||||
libnvidia-fatbinaryloader; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
# Create soname symlinks for every copied versioned library.
|
||||
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||
[ -f "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||
echo "${lib}: .so.1 -> $base"
|
||||
stem=${base%%.so.*}
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||
done
|
||||
|
||||
touch "$CACHE_LAYOUT_MARKER"
|
||||
|
||||
echo "=== NVIDIA build complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $ko_count .ko files"
|
||||
|
||||
@@ -15,29 +15,50 @@ DIST_DIR="${REPO_ROOT}/dist"
|
||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
AUTH_KEYS=""
|
||||
BUILD_VARIANT="nvidia"
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||
|
||||
# parse args
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||
*) echo "unknown arg: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$BEE_GPU_VENDOR" in
|
||||
nvidia|amd|nogpu) ;;
|
||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||
case "$BUILD_VARIANT" in
|
||||
nvidia)
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||
;;
|
||||
nvidia-legacy)
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||
;;
|
||||
amd)
|
||||
BEE_GPU_VENDOR="amd"
|
||||
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||
;;
|
||||
nogpu)
|
||||
BEE_GPU_VENDOR="nogpu"
|
||||
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||
;;
|
||||
*)
|
||||
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||
|
||||
export BEE_GPU_VENDOR
|
||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||
git config --global safe.directory "${REPO_ROOT}"
|
||||
@@ -53,15 +74,8 @@ resolve_audit_version() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
if [ -z "${tag}" ]; then
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
fi
|
||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||
case "${tag}" in
|
||||
audit/v*)
|
||||
echo "${tag#audit/v}"
|
||||
return 0
|
||||
;;
|
||||
v*)
|
||||
echo "${tag#v}"
|
||||
return 0
|
||||
@@ -144,6 +158,25 @@ iso_extract_file() {
|
||||
return 127
|
||||
}
|
||||
|
||||
iso_read_file_list() {
|
||||
iso_path="$1"
|
||||
out_path="$2"
|
||||
|
||||
iso_list_files "$iso_path" > "$out_path" || return 1
|
||||
[ -s "$out_path" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
iso_read_member() {
|
||||
iso_path="$1"
|
||||
iso_member="$2"
|
||||
out_path="$3"
|
||||
|
||||
iso_extract_file "$iso_path" "$iso_member" > "$out_path" || return 1
|
||||
[ -s "$out_path" ] || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
require_iso_reader() {
|
||||
command -v bsdtar >/dev/null 2>&1 && return 0
|
||||
command -v xorriso >/dev/null 2>&1 && return 0
|
||||
@@ -177,6 +210,16 @@ dump_memtest_debug() {
|
||||
fi
|
||||
done
|
||||
|
||||
echo "-- source binary hooks --"
|
||||
for hook in \
|
||||
"${BUILDER_DIR}/config/hooks/normal/9100-memtest.hook.binary"; do
|
||||
if [ -f "$hook" ]; then
|
||||
echo " hook: $hook"
|
||||
else
|
||||
echo " (missing $hook)"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
|
||||
echo "-- live-build workdir package lists --"
|
||||
for pkg in \
|
||||
@@ -203,6 +246,20 @@ dump_memtest_debug() {
|
||||
echo " (missing $lb_dir/binary/boot)"
|
||||
fi
|
||||
|
||||
echo "-- live-build binary grub cfg --"
|
||||
if [ -f "$lb_dir/binary/boot/grub/grub.cfg" ]; then
|
||||
grep -n 'Memory Test\|memtest' "$lb_dir/binary/boot/grub/grub.cfg" || echo " (no memtest lines)"
|
||||
else
|
||||
echo " (missing $lb_dir/binary/boot/grub/grub.cfg)"
|
||||
fi
|
||||
|
||||
echo "-- live-build binary isolinux cfg --"
|
||||
if [ -f "$lb_dir/binary/isolinux/live.cfg" ]; then
|
||||
grep -n 'Memory Test\|memtest' "$lb_dir/binary/isolinux/live.cfg" || echo " (no memtest lines)"
|
||||
else
|
||||
echo " (missing $lb_dir/binary/isolinux/live.cfg)"
|
||||
fi
|
||||
|
||||
echo "-- live-build package cache --"
|
||||
if [ -d "$lb_dir/cache/packages.chroot" ]; then
|
||||
find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
|
||||
@@ -212,14 +269,32 @@ dump_memtest_debug() {
|
||||
fi
|
||||
|
||||
if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
|
||||
iso_files="$(mktemp)"
|
||||
iso_grub_cfg="$(mktemp)"
|
||||
iso_isolinux_cfg="$(mktemp)"
|
||||
|
||||
echo "-- ISO memtest files --"
|
||||
iso_list_files "$iso_path" | grep 'memtest' | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
||||
if iso_read_file_list "$iso_path" "$iso_files"; then
|
||||
grep 'memtest' "$iso_files" | sed 's/^/ /' || echo " (no memtest files in ISO)"
|
||||
else
|
||||
echo " (failed to list ISO contents)"
|
||||
fi
|
||||
|
||||
echo "-- ISO GRUB memtest lines --"
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in boot/grub/grub.cfg)"
|
||||
if iso_read_member "$iso_path" boot/grub/grub.cfg "$iso_grub_cfg"; then
|
||||
grep -n 'Memory Test\|memtest' "$iso_grub_cfg" || echo " (no memtest lines in boot/grub/grub.cfg)"
|
||||
else
|
||||
echo " (failed to read boot/grub/grub.cfg from ISO)"
|
||||
fi
|
||||
|
||||
echo "-- ISO isolinux memtest lines --"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg 2>/dev/null | grep -n 'Memory Test\|memtest' || echo " (no memtest lines in isolinux/live.cfg)"
|
||||
if iso_read_member "$iso_path" isolinux/live.cfg "$iso_isolinux_cfg"; then
|
||||
grep -n 'Memory Test\|memtest' "$iso_isolinux_cfg" || echo " (no memtest lines in isolinux/live.cfg)"
|
||||
else
|
||||
echo " (failed to read isolinux/live.cfg from ISO)"
|
||||
fi
|
||||
|
||||
rm -f "$iso_files" "$iso_grub_cfg" "$iso_isolinux_cfg"
|
||||
fi
|
||||
|
||||
echo "=== end memtest debug: ${phase} ==="
|
||||
@@ -235,63 +310,360 @@ dump_memtest_debug() {
|
||||
memtest_fail() {
|
||||
msg="$1"
|
||||
iso_path="${2:-}"
|
||||
echo "ERROR: ${msg}" >&2
|
||||
level="WARNING"
|
||||
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||
level="ERROR"
|
||||
fi
|
||||
echo "${level}: ${msg}" >&2
|
||||
dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2
|
||||
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||
exit 1
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
nvidia_runtime_fail() {
|
||||
msg="$1"
|
||||
echo "ERROR: ${msg}" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
iso_memtest_present() {
|
||||
iso_path="$1"
|
||||
iso_files="$(mktemp)"
|
||||
|
||||
[ -f "$iso_path" ] || return 1
|
||||
|
||||
if command -v bsdtar >/dev/null 2>&1; then
|
||||
:
|
||||
elif command -v xorriso >/dev/null 2>&1; then
|
||||
:
|
||||
else
|
||||
return 2
|
||||
fi
|
||||
|
||||
iso_read_file_list "$iso_path" "$iso_files" || {
|
||||
rm -f "$iso_files"
|
||||
return 2
|
||||
}
|
||||
|
||||
grep -q '^boot/memtest86+x64\.bin$' "$iso_files" || {
|
||||
rm -f "$iso_files"
|
||||
return 1
|
||||
}
|
||||
grep -q '^boot/memtest86+x64\.efi$' "$iso_files" || {
|
||||
rm -f "$iso_files"
|
||||
return 1
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 2
|
||||
}
|
||||
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 2
|
||||
}
|
||||
|
||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 1
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 1
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 1
|
||||
}
|
||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 1
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 1
|
||||
}
|
||||
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
|
||||
validate_iso_memtest() {
|
||||
iso_path="$1"
|
||||
echo "=== validating memtest in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
||||
require_iso_reader "$iso_path"
|
||||
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.bin$' || {
|
||||
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
||||
[ -f "$iso_path" ] || {
|
||||
memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
|
||||
return 0
|
||||
}
|
||||
iso_list_files "$iso_path" | grep -q '^boot/memtest86+x64\.efi$' || {
|
||||
require_iso_reader "$iso_path" || return 0
|
||||
|
||||
iso_files="$(mktemp)"
|
||||
iso_read_file_list "$iso_path" "$iso_files" || {
|
||||
memtest_fail "failed to list ISO contents while validating memtest" "$iso_path"
|
||||
rm -f "$iso_files"
|
||||
return 0
|
||||
}
|
||||
|
||||
grep -q '^boot/memtest86+x64\.bin$' "$iso_files" || {
|
||||
memtest_fail "memtest BIOS binary missing in ISO: boot/memtest86+x64.bin" "$iso_path"
|
||||
rm -f "$iso_files"
|
||||
return 0
|
||||
}
|
||||
grep -q '^boot/memtest86+x64\.efi$' "$iso_files" || {
|
||||
memtest_fail "memtest EFI binary missing in ISO: boot/memtest86+x64.efi" "$iso_path"
|
||||
rm -f "$iso_files"
|
||||
return 0
|
||||
}
|
||||
|
||||
grub_cfg="$(mktemp)"
|
||||
isolinux_cfg="$(mktemp)"
|
||||
|
||||
iso_extract_file "$iso_path" boot/grub/grub.cfg > "$grub_cfg" || memtest_fail "failed to extract boot/grub/grub.cfg from ISO" "$iso_path"
|
||||
iso_extract_file "$iso_path" isolinux/live.cfg > "$isolinux_cfg" || memtest_fail "failed to extract isolinux/live.cfg from ISO" "$iso_path"
|
||||
iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" || {
|
||||
memtest_fail "failed to read boot/grub/grub.cfg from ISO" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg" || {
|
||||
memtest_fail "failed to read isolinux/live.cfg from ISO" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
|
||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" || {
|
||||
memtest_fail "GRUB menu entry for memtest is missing" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest EFI path is missing" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" || {
|
||||
memtest_fail "GRUB memtest BIOS path is missing" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux menu entry for memtest is missing" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg" || {
|
||||
memtest_fail "isolinux memtest path is missing" "$iso_path"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
return 0
|
||||
}
|
||||
|
||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||
rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
|
||||
echo "=== memtest validation OK ==="
|
||||
}
|
||||
|
||||
validate_iso_nvidia_runtime() {
|
||||
iso_path="$1"
|
||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||
|
||||
echo "=== validating NVIDIA runtime in ISO ==="
|
||||
|
||||
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
|
||||
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
|
||||
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
|
||||
|
||||
squashfs_tmp="$(mktemp)"
|
||||
squashfs_list="$(mktemp)"
|
||||
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
||||
}
|
||||
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
||||
}
|
||||
|
||||
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||
}
|
||||
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||
}
|
||||
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||
}
|
||||
|
||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||
echo "=== NVIDIA runtime validation OK ==="
|
||||
}
|
||||
|
||||
append_memtest_grub_entry() {
|
||||
grub_cfg="$1"
|
||||
[ -f "$grub_cfg" ] || return 1
|
||||
grep -q 'Memory Test (memtest86+)' "$grub_cfg" && return 0
|
||||
grep -q '### BEE MEMTEST ###' "$grub_cfg" && return 0
|
||||
|
||||
cat >> "$grub_cfg" <<'EOF'
|
||||
|
||||
### BEE MEMTEST ###
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
### /BEE MEMTEST ###
|
||||
EOF
|
||||
}
|
||||
|
||||
append_memtest_isolinux_entry() {
|
||||
isolinux_cfg="$1"
|
||||
[ -f "$isolinux_cfg" ] || return 1
|
||||
grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" && return 0
|
||||
grep -q '### BEE MEMTEST ###' "$isolinux_cfg" && return 0
|
||||
|
||||
cat >> "$isolinux_cfg" <<'EOF'
|
||||
|
||||
# ### BEE MEMTEST ###
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
# ### /BEE MEMTEST ###
|
||||
EOF
|
||||
}
|
||||
|
||||
copy_memtest_from_deb() {
|
||||
deb="$1"
|
||||
dst_boot="$2"
|
||||
tmpdir="$(mktemp -d)"
|
||||
|
||||
dpkg-deb -x "$deb" "$tmpdir"
|
||||
for f in memtest86+x64.bin memtest86+x64.efi; do
|
||||
if [ -f "$tmpdir/boot/$f" ]; then
|
||||
cp "$tmpdir/boot/$f" "$dst_boot/$f"
|
||||
fi
|
||||
done
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
|
||||
reset_live_build_stage() {
|
||||
lb_dir="$1"
|
||||
stage="$2"
|
||||
|
||||
for root in \
|
||||
"$lb_dir/.build" \
|
||||
"$lb_dir/.stage" \
|
||||
"$lb_dir/auto"; do
|
||||
[ -d "$root" ] || continue
|
||||
find "$root" -maxdepth 1 \( -name "${stage}" -o -name "${stage}.*" -o -name "*${stage}*" \) -exec rm -rf {} + 2>/dev/null || true
|
||||
done
|
||||
}
|
||||
|
||||
recover_iso_memtest() {
|
||||
lb_dir="$1"
|
||||
iso_path="$2"
|
||||
binary_boot="$lb_dir/binary/boot"
|
||||
grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
|
||||
isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"
|
||||
|
||||
echo "=== attempting memtest recovery in binary tree ==="
|
||||
|
||||
mkdir -p "$binary_boot"
|
||||
|
||||
for root in \
|
||||
"$lb_dir/chroot/boot" \
|
||||
"/boot"; do
|
||||
for f in memtest86+x64.bin memtest86+x64.efi; do
|
||||
if [ ! -f "$binary_boot/$f" ] && [ -f "$root/$f" ]; then
|
||||
cp "$root/$f" "$binary_boot/$f"
|
||||
echo "memtest recovery: copied $f from $root"
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
|
||||
for dir in \
|
||||
"$lb_dir/cache/packages.binary" \
|
||||
"$lb_dir/cache/packages.chroot" \
|
||||
"$lb_dir/chroot/var/cache/apt/archives" \
|
||||
"${BEE_CACHE_DIR:-${DIST_DIR}/cache}/lb-packages" \
|
||||
"/var/cache/apt/archives"; do
|
||||
[ -d "$dir" ] || continue
|
||||
deb="$(find "$dir" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||
[ -n "$deb" ] || continue
|
||||
echo "memtest recovery: extracting payload from $deb"
|
||||
copy_memtest_from_deb "$deb" "$binary_boot"
|
||||
break
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ! -f "$binary_boot/memtest86+x64.bin" ] || [ ! -f "$binary_boot/memtest86+x64.efi" ]; then
|
||||
tmpdl="$(mktemp -d)"
|
||||
if (
|
||||
cd "$tmpdl" && apt-get download memtest86+ >/dev/null 2>&1
|
||||
); then
|
||||
deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
|
||||
if [ -n "$deb" ]; then
|
||||
echo "memtest recovery: downloaded $deb"
|
||||
copy_memtest_from_deb "$deb" "$binary_boot"
|
||||
fi
|
||||
fi
|
||||
rm -rf "$tmpdl"
|
||||
fi
|
||||
|
||||
if [ -f "$grub_cfg" ]; then
|
||||
append_memtest_grub_entry "$grub_cfg" && echo "memtest recovery: ensured GRUB entry"
|
||||
else
|
||||
echo "memtest recovery: WARNING: missing $grub_cfg"
|
||||
fi
|
||||
|
||||
if [ -f "$isolinux_cfg" ]; then
|
||||
append_memtest_isolinux_entry "$isolinux_cfg" && echo "memtest recovery: ensured isolinux entry"
|
||||
else
|
||||
echo "memtest recovery: WARNING: missing $isolinux_cfg"
|
||||
fi
|
||||
|
||||
reset_live_build_stage "$lb_dir" "binary_checksums"
|
||||
reset_live_build_stage "$lb_dir" "binary_iso"
|
||||
reset_live_build_stage "$lb_dir" "binary_zsync"
|
||||
|
||||
run_optional_step_sh "rebuild live-build checksums after memtest recovery" "91-lb-checksums" "lb binary_checksums 2>&1"
|
||||
run_optional_step_sh "rebuild ISO after memtest recovery" "92-lb-binary-iso" "rm -f '$iso_path' && lb binary_iso 2>&1"
|
||||
run_optional_step_sh "rebuild zsync after memtest recovery" "93-lb-zsync" "lb binary_zsync 2>&1"
|
||||
|
||||
if [ ! -f "$iso_path" ]; then
|
||||
memtest_fail "ISO rebuild was skipped or failed after memtest recovery: $iso_path" "$iso_path"
|
||||
fi
|
||||
}
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
LOG_DIR="${DIST_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${DIST_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${DIST_DIR}/${ISO_BASENAME}.iso"
|
||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||
mkdir -p "${OUT_DIR}"
|
||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
ISO_OUT="${OUT_DIR}/${ISO_BASENAME}.iso"
|
||||
LOG_OUT="${LOG_DIR}/build.log"
|
||||
|
||||
cleanup_build_log() {
|
||||
status="${1:-$?}"
|
||||
trap - EXIT INT TERM HUP
|
||||
|
||||
if [ "${STEP_LOG_ACTIVE:-0}" = "1" ]; then
|
||||
cleanup_step_log "${status}" || true
|
||||
fi
|
||||
|
||||
if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
|
||||
BUILD_LOG_ACTIVE=0
|
||||
exec 1>&3 2>&4
|
||||
@@ -304,7 +676,8 @@ cleanup_build_log() {
|
||||
|
||||
if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
|
||||
rm -f "${LOG_ARCHIVE}"
|
||||
tar -czf "${LOG_ARCHIVE}" -C "${DIST_DIR}" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
tar -czf "${LOG_ARCHIVE}" -C "$(dirname "${LOG_DIR}")" "$(basename "${LOG_DIR}")" 2>/dev/null || true
|
||||
rm -rf "${LOG_DIR}"
|
||||
fi
|
||||
|
||||
exit "${status}"
|
||||
@@ -335,6 +708,89 @@ start_build_log() {
|
||||
echo "=== build log archive: ${LOG_ARCHIVE} ==="
|
||||
}
|
||||
|
||||
cleanup_step_log() {
|
||||
status="${1:-$?}"
|
||||
|
||||
if [ "${STEP_LOG_ACTIVE:-0}" = "1" ]; then
|
||||
STEP_LOG_ACTIVE=0
|
||||
exec 1>&5 2>&6
|
||||
exec 5>&- 6>&-
|
||||
if [ -n "${STEP_TEE_PID:-}" ]; then
|
||||
wait "${STEP_TEE_PID}" 2>/dev/null || true
|
||||
fi
|
||||
rm -f "${STEP_LOG_PIPE}"
|
||||
fi
|
||||
|
||||
return "${status}"
|
||||
}
|
||||
|
||||
run_step() {
|
||||
step_name="$1"
|
||||
step_slug="$2"
|
||||
shift 2
|
||||
|
||||
step_log="${LOG_DIR}/${step_slug}.log"
|
||||
echo ""
|
||||
echo "=== step: ${step_name} ==="
|
||||
echo "=== step log: ${step_log} ==="
|
||||
|
||||
STEP_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-step-log.XXXXXX")"
|
||||
mkfifo "${STEP_LOG_PIPE}"
|
||||
|
||||
exec 5>&1 6>&2
|
||||
tee "${step_log}" < "${STEP_LOG_PIPE}" >&5 &
|
||||
STEP_TEE_PID=$!
|
||||
exec > "${STEP_LOG_PIPE}" 2>&1
|
||||
STEP_LOG_ACTIVE=1
|
||||
|
||||
set +e
|
||||
"$@"
|
||||
step_status=$?
|
||||
set -e
|
||||
|
||||
cleanup_step_log "${step_status}"
|
||||
if [ "${step_status}" -ne 0 ]; then
|
||||
echo "ERROR: step failed: ${step_name} (see ${step_log})" >&2
|
||||
exit "${step_status}"
|
||||
fi
|
||||
|
||||
echo "=== step OK: ${step_name} ==="
|
||||
}
|
||||
|
||||
run_step_sh() {
|
||||
step_name="$1"
|
||||
step_slug="$2"
|
||||
step_script="$3"
|
||||
|
||||
run_step "${step_name}" "${step_slug}" sh -c "${step_script}"
|
||||
}
|
||||
|
||||
run_optional_step_sh() {
|
||||
step_name="$1"
|
||||
step_slug="$2"
|
||||
step_script="$3"
|
||||
|
||||
if [ "${BEE_REQUIRE_MEMTEST:-0}" = "1" ]; then
|
||||
run_step_sh "${step_name}" "${step_slug}" "${step_script}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
step_log="${LOG_DIR}/${step_slug}.log"
|
||||
echo ""
|
||||
echo "=== optional step: ${step_name} ==="
|
||||
echo "=== optional step log: ${step_log} ==="
|
||||
set +e
|
||||
sh -c "${step_script}" > "${step_log}" 2>&1
|
||||
step_status=$?
|
||||
set -e
|
||||
cat "${step_log}"
|
||||
if [ "${step_status}" -ne 0 ]; then
|
||||
echo "WARNING: optional step failed: ${step_name} (see ${step_log})" >&2
|
||||
else
|
||||
echo "=== optional step OK: ${step_name} ==="
|
||||
fi
|
||||
}
|
||||
|
||||
start_build_log
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
@@ -365,13 +821,13 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||
apt-get install -y "linux-headers-${KVER}"
|
||||
fi
|
||||
|
||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
echo "=== syncing git submodules ==="
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
run_step "sync git submodules" "05-git-submodules" \
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
@@ -383,13 +839,13 @@ if [ -f "$BEE_BIN" ]; then
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee binary ==="
|
||||
cd "${REPO_ROOT}/audit"
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||
-o "$BEE_BIN" \
|
||||
./cmd/bee
|
||||
run_step_sh "build bee binary" "10-build-bee" \
|
||||
"cd '${REPO_ROOT}/audit' && \
|
||||
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags '-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}' \
|
||||
-o '${BEE_BIN}' \
|
||||
./cmd/bee"
|
||||
echo "binary: $BEE_BIN"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
BEE_SIZE_BYTES="$(stat -c '%s' "$BEE_BIN" 2>/dev/null || stat -f '%z' "$BEE_BIN")"
|
||||
@@ -408,9 +864,8 @@ fi
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
@@ -418,14 +873,42 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
if [ -n "$fp4_type_match" ]; then
|
||||
echo "fp4_header_symbol=present"
|
||||
echo "$fp4_type_match"
|
||||
else
|
||||
echo "fp4_header_symbol=missing"
|
||||
fi
|
||||
if [ -n "$fp4_scale_match" ]; then
|
||||
echo "fp4_scale_mode_symbol=present"
|
||||
echo "$fp4_scale_match"
|
||||
else
|
||||
echo "fp4_scale_mode_symbol=missing"
|
||||
fi
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
for dep in \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
"${BUILDER_DIR}/VERSIONS"; do
|
||||
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
|
||||
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-burn worker ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
run_step "build bee-gpu-burn worker" "21-gpu-burn-worker" \
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_BURN_WORKER_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
@@ -434,9 +917,15 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
else
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
echo "=== bee-gpu-burn compiled profile probe ==="
|
||||
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
|
||||
echo "fp4_profile_string=present"
|
||||
else
|
||||
echo "fp4_profile_string=missing"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
@@ -462,13 +951,92 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||
fi
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
||||
source /boot/grub/config.cfg
|
||||
|
||||
echo ""
|
||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "UEFI Firmware Settings" {
|
||||
fwsetup
|
||||
}
|
||||
fi
|
||||
EOF
|
||||
|
||||
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
||||
label live-@FLAVOUR@-normal
|
||||
menu label ^EASY-BEE
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
EOF
|
||||
fi
|
||||
|
||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||
rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
@@ -546,12 +1114,11 @@ done
|
||||
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
@@ -576,9 +1143,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
@@ -591,9 +1157,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
run_step "build nccl-tests ${NCCL_TESTS_VERSION}" "60-nccl-tests" \
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
@@ -607,9 +1172,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
echo ""
|
||||
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
@@ -625,13 +1189,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||
@@ -643,6 +1208,7 @@ fi
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
@@ -653,6 +1219,11 @@ EOF
|
||||
|
||||
# Write GPU vendor marker for hooks
|
||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||
else
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||
fi
|
||||
|
||||
# Patch motd with build info
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||
@@ -723,17 +1294,17 @@ fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
lb build 2>&1
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
@@ -746,10 +1317,22 @@ fi
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||
if iso_memtest_present "$ISO_RAW"; then
|
||||
:
|
||||
else
|
||||
memtest_status=$?
|
||||
if [ "$memtest_status" -eq 1 ]; then
|
||||
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||
elif [ "$memtest_status" -eq 2 ]; then
|
||||
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== done (${BUILD_VARIANT}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||
|
||||
@@ -7,26 +7,29 @@ echo " █████╗ ███████║███████╗ ╚
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (load to RAM)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (fail-safe)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
set color_normal=light-gray/black
|
||||
set color_highlight=white/dark-gray
|
||||
set color_highlight=yellow/black
|
||||
|
||||
if [ -e /boot/grub/splash.png ]; then
|
||||
set theme=/boot/grub/live-theme/theme.txt
|
||||
else
|
||||
set menu_color_normal=cyan/black
|
||||
set menu_color_highlight=white/dark-gray
|
||||
set menu_color_normal=yellow/black
|
||||
set menu_color_highlight=white/brown
|
||||
fi
|
||||
|
||||
@@ -3,19 +3,31 @@ label live-@FLAVOUR@-normal
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
|
||||
@@ -25,11 +25,14 @@ ensure_bee_console_user() {
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable common bee services
|
||||
systemctl enable bee-hpc-tuning.service
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
systemctl enable lightdm.service 2>/dev/null || true
|
||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||
@@ -53,11 +56,14 @@ fi
|
||||
# nogpu: no GPU services needed
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||
]
|
||||
|
||||
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
img = Image.new('RGB', (W, H), BG)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Soft amber glow under the logo without depending on font rendering.
|
||||
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||
glow_draw = ImageDraw.Draw(glow)
|
||||
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 380
|
||||
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
img = img.convert('RGB')
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
|
||||
#!/bin/sh
|
||||
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
|
||||
#
|
||||
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
|
||||
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
|
||||
# so the ioctl returns EINVAL and the verification step fails.
|
||||
#
|
||||
# The patch replaces the replace call so that if --direct-io=on fails it falls
|
||||
# back to a plain replace without direct-io, and also relaxes the verification
|
||||
# to a warning so the boot continues even when re-association is imperfect.
|
||||
set -e
|
||||
|
||||
TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
|
||||
|
||||
if [ ! -f "${TORAM_SCRIPT}" ]; then
|
||||
echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
|
||||
|
||||
# Replace any losetup --replace call that includes --direct-io=on with a
|
||||
# version that first tries with direct-io, then retries without it.
|
||||
#
|
||||
# The sed expression turns:
|
||||
# losetup --replace ... --direct-io=on LOOP FILE
|
||||
# into a shell snippet that tries both, silently.
|
||||
#
|
||||
# We also downgrade the fatal "Task finished with error." block to a warning
|
||||
# so the boot continues if re-association fails (squashfs still accessible).
|
||||
|
||||
# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
|
||||
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||
|
||||
# 2. Turn the hard error into a warning so boot continues.
|
||||
# live-boot prints this exact string when verification fails.
|
||||
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
|
||||
|
||||
echo "9010-fix-toram: patch applied"
|
||||
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
|
||||
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
139
iso/builder/config/hooks/normal/9100-memtest.hook.binary
Executable file
@@ -0,0 +1,139 @@
|
||||
#!/bin/sh
|
||||
# Ensure memtest is present in the final ISO even if live-build's built-in
|
||||
# memtest stage does not copy the binaries or expose menu entries.
|
||||
set -e
|
||||
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||
BINARY_BOOT_DIR="binary/boot"
|
||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||
ISOLINUX_CFG="binary/isolinux/live.cfg"
|
||||
|
||||
log() {
|
||||
echo "memtest hook: $*"
|
||||
}
|
||||
|
||||
fail_or_warn() {
|
||||
msg="$1"
|
||||
if [ "${BEE_REQUIRE_MEMTEST}" = "1" ]; then
|
||||
log "ERROR: ${msg}"
|
||||
exit 1
|
||||
fi
|
||||
log "WARNING: ${msg}"
|
||||
return 0
|
||||
}
|
||||
|
||||
copy_memtest_file() {
|
||||
src="$1"
|
||||
base="$(basename "$src")"
|
||||
dst="${BINARY_BOOT_DIR}/${base}"
|
||||
|
||||
[ -f "$src" ] || return 1
|
||||
mkdir -p "${BINARY_BOOT_DIR}"
|
||||
cp "$src" "$dst"
|
||||
log "copied ${base} from ${src}"
|
||||
}
|
||||
|
||||
extract_memtest_from_deb() {
|
||||
deb="$1"
|
||||
tmpdir="$(mktemp -d)"
|
||||
|
||||
log "extracting memtest payload from ${deb}"
|
||||
dpkg-deb -x "$deb" "$tmpdir"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
||||
fi
|
||||
done
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
|
||||
ensure_memtest_binaries() {
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
for root in chroot/boot /boot; do
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
|
||||
done
|
||||
done
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
[ -f "${BINARY_BOOT_DIR}/${f}" ] || missing=1
|
||||
done
|
||||
[ "$missing" -eq 1 ] || return 0
|
||||
|
||||
for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
|
||||
[ -d "$root" ] || continue
|
||||
deb="$(find "$root" -type f \( -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' \) 2>/dev/null | head -1)"
|
||||
[ -n "$deb" ] || continue
|
||||
extract_memtest_from_deb "$deb"
|
||||
break
|
||||
done
|
||||
|
||||
missing=0
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ ! -f "${BINARY_BOOT_DIR}/${f}" ]; then
|
||||
fail_or_warn "missing ${BINARY_BOOT_DIR}/${f}"
|
||||
missing=1
|
||||
fi
|
||||
done
|
||||
[ "$missing" -eq 0 ] || return 0
|
||||
}
|
||||
|
||||
ensure_grub_entry() {
|
||||
[ -f "$GRUB_CFG" ] || {
|
||||
fail_or_warn "missing ${GRUB_CFG}"
|
||||
return 0
|
||||
}
|
||||
|
||||
grep -q '### BEE MEMTEST ###' "$GRUB_CFG" && return 0
|
||||
|
||||
cat >> "$GRUB_CFG" <<'EOF'
|
||||
|
||||
### BEE MEMTEST ###
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
### /BEE MEMTEST ###
|
||||
EOF
|
||||
|
||||
log "appended memtest entry to ${GRUB_CFG}"
|
||||
}
|
||||
|
||||
ensure_isolinux_entry() {
|
||||
[ -f "$ISOLINUX_CFG" ] || {
|
||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
||||
return 0
|
||||
}
|
||||
|
||||
grep -q '### BEE MEMTEST ###' "$ISOLINUX_CFG" && return 0
|
||||
|
||||
cat >> "$ISOLINUX_CFG" <<'EOF'
|
||||
|
||||
# ### BEE MEMTEST ###
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
# ### /BEE MEMTEST ###
|
||||
EOF
|
||||
|
||||
log "appended memtest entry to ${ISOLINUX_CFG}"
|
||||
}
|
||||
|
||||
log "ensuring memtest binaries and menu entries in binary image"
|
||||
ensure_memtest_binaries
|
||||
ensure_grub_entry
|
||||
ensure_isolinux_entry
|
||||
log "memtest assets ready"
|
||||
@@ -1,3 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||
# NVIDIA DCGM (Data Center GPU Manager).
|
||||
# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
|
||||
# NVIDIA max-compute recipe. The smoketest/runtime contract treats
|
||||
# dcgmproftester as required in the LiveCD.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -60,9 +60,15 @@ qrencode
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
openbox
|
||||
tint2
|
||||
feh
|
||||
python3-pil
|
||||
xorg
|
||||
xterm
|
||||
chromium
|
||||
mousepad
|
||||
pcmanfm
|
||||
ristretto
|
||||
mupdf
|
||||
xserver-xorg-video-fbdev
|
||||
xserver-xorg-video-vesa
|
||||
lightdm
|
||||
@@ -71,9 +77,7 @@ lightdm
|
||||
firmware-linux-free
|
||||
firmware-linux-nonfree
|
||||
firmware-misc-nonfree
|
||||
firmware-amd-graphics
|
||||
firmware-realtek
|
||||
firmware-intel-sound
|
||||
firmware-bnx2
|
||||
firmware-bnx2x
|
||||
firmware-cavium
|
||||
|
||||
@@ -27,6 +27,7 @@ echo ""
|
||||
KVER=$(uname -r)
|
||||
info "kernel: $KVER"
|
||||
NVIDIA_BOOT_MODE="normal"
|
||||
NVIDIA_MODULES_FLAVOR="proprietary"
|
||||
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||
case "$arg" in
|
||||
bee.nvidia.mode=*)
|
||||
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||
;;
|
||||
esac
|
||||
done
|
||||
if [ -f /etc/bee-nvidia-modules-flavor ]; then
|
||||
NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
|
||||
fi
|
||||
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||
info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"
|
||||
|
||||
# --- PATH & binaries ---
|
||||
echo "-- PATH & binaries --"
|
||||
@@ -52,6 +57,45 @@ else
|
||||
fail "nvidia-smi: NOT FOUND"
|
||||
fi
|
||||
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
|
||||
ok "dcgmi found: $p"
|
||||
else
|
||||
fail "dcgmi: NOT FOUND"
|
||||
fi
|
||||
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
|
||||
ok "nv-hostengine found: $p"
|
||||
else
|
||||
fail "nv-hostengine: NOT FOUND"
|
||||
fi
|
||||
|
||||
DCGM_PROFTESTER=""
|
||||
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
DCGM_PROFTESTER="$p"
|
||||
break
|
||||
fi
|
||||
done
|
||||
if [ -n "$DCGM_PROFTESTER" ]; then
|
||||
ok "dcgmproftester found: $DCGM_PROFTESTER"
|
||||
else
|
||||
fail "dcgmproftester: NOT FOUND"
|
||||
fi
|
||||
|
||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
fi
|
||||
done
|
||||
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
|
||||
ok "nvbandwidth found: $p"
|
||||
else
|
||||
warn "nvbandwidth: NOT FOUND"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA modules --"
|
||||
KO_DIR="/usr/local/lib/nvidia"
|
||||
@@ -71,10 +115,12 @@ fi
|
||||
for mod in nvidia_modeset nvidia_uvm; do
|
||||
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
||||
ok "module loaded: $mod"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||
fail "module NOT loaded in normal mode: $mod"
|
||||
else
|
||||
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||
warn "module not loaded in GSP-off mode: $mod"
|
||||
else
|
||||
fail "module NOT loaded: $mod"
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -90,10 +136,12 @@ done
|
||||
|
||||
if [ -e /dev/nvidia-uvm ]; then
|
||||
ok "/dev/nvidia-uvm exists"
|
||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
||||
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||
fail "/dev/nvidia-uvm missing in normal mode"
|
||||
else
|
||||
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
||||
else
|
||||
fail "/dev/nvidia-uvm missing"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
@@ -109,6 +157,40 @@ else
|
||||
fail "nvidia-smi: not found in PATH"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- OpenCL / John --"
|
||||
if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
|
||||
ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
|
||||
else
|
||||
fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
|
||||
fi
|
||||
|
||||
if ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
|
||||
ok "libnvidia-opencl.so.1 present in linker cache"
|
||||
else
|
||||
fail "libnvidia-opencl.so.1 missing from linker cache"
|
||||
fi
|
||||
|
||||
if command -v clinfo >/dev/null 2>&1; then
|
||||
if clinfo -l 2>/dev/null | grep -q "Platform"; then
|
||||
ok "clinfo: OpenCL platform detected"
|
||||
else
|
||||
fail "clinfo: no OpenCL platform detected"
|
||||
fi
|
||||
else
|
||||
fail "clinfo: not found in PATH"
|
||||
fi
|
||||
|
||||
if command -v john >/dev/null 2>&1; then
|
||||
if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
|
||||
ok "john: OpenCL devices detected"
|
||||
else
|
||||
fail "john: no OpenCL devices detected"
|
||||
fi
|
||||
else
|
||||
fail "john: not found in PATH"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- lib symlinks --"
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
@@ -129,6 +211,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
|
||||
fi
|
||||
done
|
||||
|
||||
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
|
||||
ok "timer active: bee-selfheal.timer"
|
||||
else
|
||||
fail "timer NOT active: bee-selfheal.timer"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- runtime health --"
|
||||
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
[Unit]
|
||||
Description=Bee: on-demand hardware audit (not started automatically)
|
||||
Description=Bee: hardware audit
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=yes
|
||||
ExecStart=/bin/sh -c 'curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
@@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=Bee: boot status display
|
||||
After=systemd-user-sessions.service
|
||||
Before=getty@tty1.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=no
|
||||
ExecStart=/usr/local/bin/bee-boot-status
|
||||
TTYPath=/dev/tty1
|
||||
StandardInput=tty
|
||||
StandardOutput=tty
|
||||
StandardError=tty
|
||||
TTYReset=yes
|
||||
TTYVHangup=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
@@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description=Bee: HPC tuning (CPU governor, C-states)
|
||||
After=local-fs.target
|
||||
Before=bee-nvidia.service bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
@@ -0,0 +1,9 @@
|
||||
[Unit]
|
||||
Description=Bee: periodic runtime self-heal
|
||||
After=bee-web.service bee-audit.service bee-preflight.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
@@ -0,0 +1,11 @@
|
||||
[Unit]
|
||||
Description=Bee: run self-heal checks periodically
|
||||
|
||||
[Timer]
|
||||
OnBootSec=45sec
|
||||
OnUnitActiveSec=60sec
|
||||
AccuracySec=15sec
|
||||
Unit=bee-selfheal.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
@@ -1,11 +1,12 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
StartLimitIntervalSec=0
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
RestartSec=3
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
LimitMEMLOCK=infinity
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
[Unit]
|
||||
After=bee-boot-status.service
|
||||
@@ -0,0 +1,4 @@
|
||||
[Unit]
|
||||
|
||||
[Service]
|
||||
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user