Compare commits
56 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
93cfa78e8c | ||
|
|
1358485f2b | ||
| 8fe20ba678 | |||
| d973231f37 | |||
| f5d175f488 | |||
| fa00667750 | |||
|
|
c7d2816a7f | ||
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 | |||
| 33e0a5bef2 | |||
| 38e79143eb | |||
| 25af2df23a | |||
| 20abff7f90 | |||
| a14ec8631c | |||
| f58c7e58d3 | |||
| bf47c8dbd2 | |||
| 143b7dca5d | |||
| 9826d437a5 | |||
|
|
f3c14cd893 | ||
|
|
728270dc8e | ||
|
|
8692f825bc | ||
|
|
11f52ac710 | ||
|
|
1cb398fe83 | ||
|
|
7a843be6b0 | ||
|
|
7f6386dccc | ||
|
|
eea2591bcc | ||
|
|
295a19b93a | ||
|
|
444a7d16cc | ||
|
|
fd722692a4 |
@@ -1,7 +1,10 @@
|
|||||||
LISTEN ?= :8080
|
LISTEN ?= :8080
|
||||||
AUDIT_PATH ?=
|
AUDIT_PATH ?=
|
||||||
|
EXPORT_DIR ?= $(CURDIR)/.tmp/export
|
||||||
|
VERSION ?= $(shell sh ./scripts/resolve-version.sh)
|
||||||
|
GO_LDFLAGS := -X main.Version=$(VERSION)
|
||||||
|
|
||||||
RUN_ARGS := web --listen $(LISTEN)
|
RUN_ARGS := web --listen $(LISTEN) --export-dir $(EXPORT_DIR)
|
||||||
ifneq ($(AUDIT_PATH),)
|
ifneq ($(AUDIT_PATH),)
|
||||||
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
RUN_ARGS += --audit-path $(AUDIT_PATH)
|
||||||
endif
|
endif
|
||||||
@@ -9,10 +12,11 @@ endif
|
|||||||
.PHONY: run build test
|
.PHONY: run build test
|
||||||
|
|
||||||
run:
|
run:
|
||||||
go run ./cmd/bee $(RUN_ARGS)
|
mkdir -p $(EXPORT_DIR)
|
||||||
|
go run -ldflags "$(GO_LDFLAGS)" ./cmd/bee $(RUN_ARGS)
|
||||||
|
|
||||||
build:
|
build:
|
||||||
go build -o bee ./cmd/bee
|
go build -ldflags "$(GO_LDFLAGS)" -o bee ./cmd/bee
|
||||||
|
|
||||||
test:
|
test:
|
||||||
go test ./...
|
go test ./...
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"runtime/debug"
|
"runtime/debug"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -21,30 +22,7 @@ var Version = "dev"
|
|||||||
func buildLabel() string {
|
func buildLabel() string {
|
||||||
label := strings.TrimSpace(Version)
|
label := strings.TrimSpace(Version)
|
||||||
if label == "" {
|
if label == "" {
|
||||||
label = "dev"
|
return "dev"
|
||||||
}
|
|
||||||
if info, ok := debug.ReadBuildInfo(); ok {
|
|
||||||
var revision string
|
|
||||||
var modified bool
|
|
||||||
for _, setting := range info.Settings {
|
|
||||||
switch setting.Key {
|
|
||||||
case "vcs.revision":
|
|
||||||
revision = setting.Value
|
|
||||||
case "vcs.modified":
|
|
||||||
modified = setting.Value == "true"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if revision != "" {
|
|
||||||
short := revision
|
|
||||||
if len(short) > 12 {
|
|
||||||
short = short[:12]
|
|
||||||
}
|
|
||||||
label += " (" + short
|
|
||||||
if modified {
|
|
||||||
label += "+"
|
|
||||||
}
|
|
||||||
label += ")"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return label
|
return label
|
||||||
}
|
}
|
||||||
@@ -53,10 +31,19 @@ func main() {
|
|||||||
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
||||||
}
|
}
|
||||||
|
|
||||||
func run(args []string, stdout, stderr io.Writer) int {
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||||
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
||||||
Level: slog.LevelInfo,
|
Level: slog.LevelInfo,
|
||||||
})))
|
})))
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("fatal panic",
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
exitCode = 1
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
if len(args) == 0 {
|
if len(args) == 0 {
|
||||||
printRootUsage(stderr)
|
printRootUsage(stderr)
|
||||||
@@ -82,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runWeb(args[1:], stdout, stderr)
|
return runWeb(args[1:], stdout, stderr)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT(args[1:], stdout, stderr)
|
return runSAT(args[1:], stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark(args[1:], stdout, stderr)
|
||||||
case "version", "--version", "-version":
|
case "version", "--version", "-version":
|
||||||
fmt.Fprintln(stdout, Version)
|
fmt.Fprintln(stdout, Version)
|
||||||
return 0
|
return 0
|
||||||
@@ -98,8 +87,9 @@ func printRootUsage(w io.Writer) {
|
|||||||
bee preflight --output stdout|file:<path>
|
bee preflight --output stdout|file:<path>
|
||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||||
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||||
bee version
|
bee version
|
||||||
bee help [command]`)
|
bee help [command]`)
|
||||||
}
|
}
|
||||||
@@ -118,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runWeb([]string{"--help"}, stdout, stdout)
|
return runWeb([]string{"--help"}, stdout, stdout)
|
||||||
case "sat":
|
case "sat":
|
||||||
return runSAT([]string{"--help"}, stdout, stderr)
|
return runSAT([]string{"--help"}, stdout, stderr)
|
||||||
|
case "benchmark":
|
||||||
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||||
case "version":
|
case "version":
|
||||||
fmt.Fprintln(stdout, "usage: bee version")
|
fmt.Fprintln(stdout, "usage: bee version")
|
||||||
return 0
|
return 0
|
||||||
@@ -304,7 +296,7 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
||||||
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
auditPath := fs.String("audit-path", "", "optional path to the latest audit JSON snapshot")
|
||||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||||
title := fs.String("title", "Bee Hardware Audit", "page title")
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
||||||
fs.Usage = func() {
|
fs.Usage = func() {
|
||||||
@@ -407,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
slog.Info("sat archive written", "target", target, "path", archive)
|
slog.Info("sat archive written", "target", target, "path", archive)
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||||
|
if len(args) == 0 {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||||
|
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
target := args[0]
|
||||||
|
if target != "nvidia" {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
||||||
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
||||||
|
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
||||||
|
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
||||||
|
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
||||||
|
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
||||||
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
|
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: *profile,
|
||||||
|
SizeMB: *sizeMB,
|
||||||
|
GPUIndices: includeIndices,
|
||||||
|
ExcludeGPUIndices: excludeIndices,
|
||||||
|
RunNCCL: !*skipNCCL,
|
||||||
|
}, logLine)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("run benchmark", "target", target, "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
slog.Info("benchmark archive written", "target", target, "path", archive)
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||||
|
raw = strings.TrimSpace(raw)
|
||||||
|
if raw == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, part := range strings.Split(raw, ",") {
|
||||||
|
part = strings.TrimSpace(part)
|
||||||
|
if part == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(part)
|
||||||
|
if err != nil || value < 0 {
|
||||||
|
return nil, fmt.Errorf("bad gpu index %q", part)
|
||||||
|
}
|
||||||
|
indices = append(indices, value)
|
||||||
|
}
|
||||||
|
return indices, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -46,8 +46,6 @@ func TestRunUnknownCommand(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunVersion(t *testing.T) {
|
func TestRunVersion(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
old := Version
|
old := Version
|
||||||
Version = "test-version"
|
Version = "test-version"
|
||||||
t.Cleanup(func() { Version = old })
|
t.Cleanup(func() { Version = old })
|
||||||
@@ -62,6 +60,16 @@ func TestRunVersion(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildLabelUsesVersionAsIs(t *testing.T) {
|
||||||
|
old := Version
|
||||||
|
Version = "1.2.3"
|
||||||
|
t.Cleanup(func() { Version = old })
|
||||||
|
|
||||||
|
if got := buildLabel(); got != "1.2.3" {
|
||||||
|
t.Fatalf("buildLabel=%q want %q", got, "1.2.3")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunExportRequiresTarget(t *testing.T) {
|
func TestRunExportRequiresTarget(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -19,17 +19,18 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
DefaultExportDir = "/appdata/bee/export"
|
DefaultExportDir = "/appdata/bee/export"
|
||||||
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json"
|
||||||
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log"
|
||||||
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
DefaultWebLogPath = DefaultExportDir + "/bee-web.log"
|
||||||
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log"
|
||||||
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log"
|
||||||
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log"
|
||||||
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json"
|
||||||
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
|
||||||
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
DefaultTechDumpDir = DefaultExportDir + "/techdump"
|
||||||
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
|
||||||
|
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark"
|
||||||
)
|
)
|
||||||
|
|
||||||
type App struct {
|
type App struct {
|
||||||
@@ -40,6 +41,8 @@ type App struct {
|
|||||||
sat satRunner
|
sat satRunner
|
||||||
runtime runtimeChecker
|
runtime runtimeChecker
|
||||||
installer installer
|
installer installer
|
||||||
|
// StatusDB is the unified component health store (nil if unavailable).
|
||||||
|
StatusDB *ComponentStatusDB
|
||||||
}
|
}
|
||||||
|
|
||||||
type ActionResult struct {
|
type ActionResult struct {
|
||||||
@@ -112,6 +115,12 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
|||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
@@ -136,7 +145,7 @@ type runtimeChecker interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func New(platform *platform.System) *App {
|
func New(platform *platform.System) *App {
|
||||||
return &App{
|
a := &App{
|
||||||
network: platform,
|
network: platform,
|
||||||
services: platform,
|
services: platform,
|
||||||
exports: platform,
|
exports: platform,
|
||||||
@@ -145,6 +154,10 @@ func New(platform *platform.System) *App {
|
|||||||
runtime: platform,
|
runtime: platform,
|
||||||
installer: platform,
|
installer: platform,
|
||||||
}
|
}
|
||||||
|
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||||
|
a.StatusDB = db
|
||||||
|
}
|
||||||
|
return a
|
||||||
}
|
}
|
||||||
|
|
||||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||||
@@ -154,7 +167,7 @@ func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||||
return json.MarshalIndent(snap, "", " ")
|
return json.MarshalIndent(snap, "", " ")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,7 +187,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
result := collector.Run(runtimeMode)
|
result := collector.Run(runtimeMode)
|
||||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||||
result.Runtime = &health
|
result.Runtime = &health
|
||||||
}
|
}
|
||||||
@@ -189,10 +202,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -217,10 +227,7 @@ func (a *App) RunRuntimePreflight(output string) (string, error) {
|
|||||||
return "stdout", err
|
return "stdout", err
|
||||||
case strings.HasPrefix(output, "file:"):
|
case strings.HasPrefix(output, "file:"):
|
||||||
path := strings.TrimPrefix(output, "file:")
|
path := strings.TrimPrefix(output, "file:")
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil {
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
if err := os.WriteFile(path, append(data, '\n'), 0644); err != nil {
|
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return path, nil
|
return path, nil
|
||||||
@@ -526,10 +533,56 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBenchmarkBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
@@ -880,6 +933,12 @@ func latestSATSummaries() []string {
|
|||||||
prefix string
|
prefix string
|
||||||
}{
|
}{
|
||||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
{label: "Memory SAT", prefix: "memory-"},
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
{label: "Storage SAT", prefix: "storage-"},
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
{label: "CPU SAT", prefix: "cpu-"},
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
|
|||||||
@@ -120,15 +120,21 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
detectVendorFn func() string
|
runNvidiaPulseFn func(string, int, []int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
runNvidiaBandwidthFn func(string, []int) (string, error)
|
||||||
runAMDPackFn func(string) (string, error)
|
runNvidiaTargetedStressFn func(string, int, []int) (string, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runMemoryFn func(string) (string, error)
|
||||||
|
runStorageFn func(string) (string, error)
|
||||||
|
runCPUFn func(string, int) (string, error)
|
||||||
|
detectVendorFn func() string
|
||||||
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -139,6 +145,48 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBenchmarkFn != nil {
|
||||||
|
return f.runNvidiaBenchmarkFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaComputeFn != nil {
|
||||||
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaTargetedPowerPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerFn != nil {
|
||||||
|
return f.runNvidiaPowerFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPulseTestPack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPulseFn != nil {
|
||||||
|
return f.runNvidiaPulseFn(baseDir, durationSec, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaBandwidthPack(_ context.Context, baseDir string, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaBandwidthFn != nil {
|
||||||
|
return f.runNvidiaBandwidthFn(baseDir, gpuIndices)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaStressFn != nil {
|
if f.runNvidiaStressFn != nil {
|
||||||
return f.runNvidiaStressFn(baseDir, opts)
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
@@ -754,6 +802,26 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, want := range []string{
|
||||||
|
"/system/ip-link.txt",
|
||||||
|
"/system/ip-link-stats.txt",
|
||||||
|
"/system/ethtool-info.txt",
|
||||||
|
"/system/ethtool-link.txt",
|
||||||
|
"/system/ethtool-module.txt",
|
||||||
|
"/system/mstflint-query.txt",
|
||||||
|
} {
|
||||||
|
var found bool
|
||||||
|
for _, name := range names {
|
||||||
|
if contains(name, want) {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("support bundle missing %s, names=%v", want, names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var foundRaw bool
|
var foundRaw bool
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
if contains(name, "/export/bee-sat/memory-run/verbose.log") {
|
||||||
|
|||||||
48
audit/internal/app/atomic_write.go
Normal file
48
audit/internal/app/atomic_write.go
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
func atomicWriteFile(path string, data []byte, perm os.FileMode) error {
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return fmt.Errorf("mkdir %s: %w", filepath.Dir(path), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpPath := path + ".tmp"
|
||||||
|
f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("open temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
success := false
|
||||||
|
defer func() {
|
||||||
|
_ = f.Close()
|
||||||
|
if !success {
|
||||||
|
_ = os.Remove(tmpPath)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if _, err := f.Write(data); err != nil {
|
||||||
|
return fmt.Errorf("write temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Sync(); err != nil {
|
||||||
|
return fmt.Errorf("sync temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := f.Close(); err != nil {
|
||||||
|
return fmt.Errorf("close temp %s: %w", tmpPath, err)
|
||||||
|
}
|
||||||
|
if err := os.Rename(tmpPath, path); err != nil {
|
||||||
|
return fmt.Errorf("rename %s -> %s: %w", tmpPath, path, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if dir, err := os.Open(filepath.Dir(path)); err == nil {
|
||||||
|
_ = dir.Sync()
|
||||||
|
_ = dir.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
71
audit/internal/app/atomic_write_test.go
Normal file
71
audit/internal/app/atomic_write_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestAtomicWriteFileReplacesTargetWithoutLeavingTmp(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "bee-audit.json")
|
||||||
|
if err := os.WriteFile(path, []byte("old\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("seed file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := atomicWriteFile(path, []byte("new\n"), 0644); err != nil {
|
||||||
|
t.Fatalf("atomicWriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read final: %v", err)
|
||||||
|
}
|
||||||
|
if string(raw) != "new\n" {
|
||||||
|
t.Fatalf("final content=%q want %q", string(raw), "new\n")
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunRuntimePreflightWritesAtomically(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "runtime-health.json")
|
||||||
|
a := &App{
|
||||||
|
runtime: fakeRuntime{
|
||||||
|
collectFn: func(exportDir string) (schema.RuntimeHealth, error) {
|
||||||
|
return schema.RuntimeHealth{
|
||||||
|
Status: "OK",
|
||||||
|
ExportDir: exportDir,
|
||||||
|
DriverReady: true,
|
||||||
|
CUDAReady: true,
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := a.RunRuntimePreflight("file:" + path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RunRuntimePreflight: %v", err)
|
||||||
|
}
|
||||||
|
if got != path {
|
||||||
|
t.Fatalf("path=%q want %q", got, path)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("tmp file should be absent after success, err=%v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read runtime file: %v", err)
|
||||||
|
}
|
||||||
|
var health schema.RuntimeHealth
|
||||||
|
if err := json.Unmarshal(raw, &health); err != nil {
|
||||||
|
t.Fatalf("json unmarshal: %v", err)
|
||||||
|
}
|
||||||
|
if health.Status != "OK" {
|
||||||
|
t.Fatalf("status=%q want OK", health.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
268
audit/internal/app/component_status_db.go
Normal file
268
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||||
|
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||||
|
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||||
|
// the component stays at the highest observed severity until explicitly reset.
|
||||||
|
type ComponentStatusDB struct {
|
||||||
|
path string
|
||||||
|
mu sync.Mutex
|
||||||
|
records map[string]*ComponentStatusRecord
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||||
|
type ComponentStatusRecord struct {
|
||||||
|
ComponentKey string `json:"component_key"`
|
||||||
|
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||||
|
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||||
|
LastChangedAt time.Time `json:"last_changed_at"`
|
||||||
|
ErrorSummary string `json:"error_summary,omitempty"`
|
||||||
|
History []ComponentStatusEntry `json:"history"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ComponentStatusEntry is a single observation appended to a component's
// history.
type ComponentStatusEntry struct {
	// At is when the observation was recorded.
	At time.Time `json:"at"`
	// Status is the status reported by this observation.
	Status string `json:"status"`
	// Source is a short origin label, e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg".
	Source string `json:"source"`
	// Detail is optional free-form text accompanying the observation.
	Detail string `json:"detail,omitempty"`
}
|
||||||
|
|
||||||
|
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||||
|
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||||
|
db := &ComponentStatusDB{
|
||||||
|
path: path,
|
||||||
|
records: make(map[string]*ComponentStatusRecord),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil && !os.IsNotExist(err) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(data) > 0 {
|
||||||
|
var records []ComponentStatusRecord
|
||||||
|
if err := json.Unmarshal(data, &records); err == nil {
|
||||||
|
for i := range records {
|
||||||
|
db.records[records[i].ComponentKey] = &records[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return db, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Record writes one observation for the given component key.
|
||||||
|
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||||
|
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||||
|
// OK never downgrades an existing Warning or Critical status.
|
||||||
|
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||||
|
if db == nil || strings.TrimSpace(key) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
|
||||||
|
now := time.Now().UTC()
|
||||||
|
rec, exists := db.records[key]
|
||||||
|
if !exists {
|
||||||
|
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||||
|
db.records[key] = rec
|
||||||
|
}
|
||||||
|
rec.LastCheckedAt = now
|
||||||
|
|
||||||
|
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||||
|
rec.History = append(rec.History, entry)
|
||||||
|
|
||||||
|
// Status merge: OK never downgrades Warning/Critical.
|
||||||
|
newSev := componentSeverity(status)
|
||||||
|
curSev := componentSeverity(rec.Status)
|
||||||
|
if newSev > curSev {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
rec.ErrorSummary = detail
|
||||||
|
} else if rec.Status == "" {
|
||||||
|
rec.Status = status
|
||||||
|
rec.LastChangedAt = now
|
||||||
|
}
|
||||||
|
|
||||||
|
_ = db.saveLocked()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get returns the current record for a component key.
|
||||||
|
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||||
|
if db == nil {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
r, ok := db.records[key]
|
||||||
|
if !ok {
|
||||||
|
return ComponentStatusRecord{}, false
|
||||||
|
}
|
||||||
|
return *r, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// All returns a snapshot of all records.
|
||||||
|
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||||
|
if db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
db.mu.Lock()
|
||||||
|
defer db.mu.Unlock()
|
||||||
|
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
out = append(out, *r)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *ComponentStatusDB) saveLocked() error {
|
||||||
|
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||||
|
for _, r := range db.records {
|
||||||
|
records = append(records, *r)
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(records, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(db.path, data, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
// componentSeverity ranks a status string so that a larger number means a
// more severe state: "OK" is 1, "Warning" is 2, "Critical" is 3. Surrounding
// whitespace is ignored; any other value (including "Unknown" and "") is 0.
func componentSeverity(status string) int {
	// Ordered from least to most severe; the rank is the position plus one.
	ladder := [...]string{"OK", "Warning", "Critical"}

	name := strings.TrimSpace(status)
	for pos, level := range ladder {
		if name == level {
			return pos + 1
		}
	}
	return 0
}
|
||||||
|
|
||||||
|
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||||
|
// and writes component status records to db for the given SAT target.
|
||||||
|
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||||
|
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||||
|
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
archivePath = extractArchivePath(archivePath)
|
||||||
|
if archivePath == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
if overall == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
source := "sat:" + target
|
||||||
|
dbStatus := satStatusToDBStatus(overall)
|
||||||
|
|
||||||
|
// Map SAT target to component keys.
|
||||||
|
switch target {
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
|
"nvidia-interconnect", "nvidia-bandwidth", "amd", "nvidia-stress",
|
||||||
|
"amd-stress", "amd-mem", "amd-bandwidth":
|
||||||
|
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "memory", "memory-stress", "sat-stress":
|
||||||
|
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "cpu", "platform-stress":
|
||||||
|
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||||
|
case "storage":
|
||||||
|
// Try to record per-device if available in summary.
|
||||||
|
recordedAny := false
|
||||||
|
for key, val := range kv {
|
||||||
|
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
base := strings.TrimSuffix(key, "_status")
|
||||||
|
idx := strings.Index(base, "_")
|
||||||
|
if idx <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devName := base[:idx]
|
||||||
|
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||||
|
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||||
|
recordedAny = true
|
||||||
|
}
|
||||||
|
if !recordedAny {
|
||||||
|
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// satStatusToDBStatus translates an upper-case SAT summary status into a
// component DB status: "OK" stays "OK", "FAILED" becomes "Warning", and
// everything else (including "PARTIAL" and "UNSUPPORTED") becomes "Unknown".
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	// PARTIAL, UNSUPPORTED and any unrecognised value.
	return "Unknown"
}
|
||||||
|
|
||||||
|
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||||
|
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||||
|
func ExtractArchivePath(s string) string {
|
||||||
|
return extractArchivePath(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||||
|
// file located in the run directory alongside archivePath.
|
||||||
|
// Returns "" if the file cannot be read.
|
||||||
|
func ReadSATOverallStatus(archivePath string) string {
|
||||||
|
if strings.TrimSpace(archivePath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
kv := parseSATKV(string(data))
|
||||||
|
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractArchivePath trims s and, when the result ends in ".tar.gz", returns
// its last whitespace-separated field, which turns
// "Archive written to /path/foo.tar.gz" into "/path/foo.tar.gz". Any other
// input is returned trimmed but otherwise unchanged.
//
// Because the path is taken as the last field, an archive path that itself
// contains whitespace is cut at its final whitespace.
func extractArchivePath(s string) string {
	trimmed := strings.TrimSpace(s)
	if !strings.HasSuffix(trimmed, ".tar.gz") {
		return trimmed
	}
	words := strings.Fields(trimmed)
	if n := len(words); n > 0 {
		return words[n-1]
	}
	return trimmed
}
|
||||||
|
|
||||||
|
// parseSATKV parses newline-separated "key=value" text into a map. Each line
// is split at its first "="; key and value are trimmed of surrounding
// whitespace. Lines without "=" are skipped, and a later occurrence of a key
// overwrites an earlier one.
func parseSATKV(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, line := range strings.Split(raw, "\n") {
		halves := strings.SplitN(strings.TrimSpace(line), "=", 2)
		if len(halves) != 2 {
			continue
		}
		name := strings.TrimSpace(halves[0])
		pairs[name] = strings.TrimSpace(halves[1])
	}
	return pairs
}
|
||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
|||||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||||
applyStorageSAT(snap.Storage, summary)
|
applyStorageSAT(snap.Storage, summary)
|
||||||
}
|
}
|
||||||
|
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||||
|
applyComponentStatusDB(snap, db)
|
||||||
}
|
}
|
||||||
|
|
||||||
type satSummary struct {
|
type satSummary struct {
|
||||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||||
|
if snap == nil || db == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, rec := range db.All() {
|
||||||
|
key := rec.ComponentKey
|
||||||
|
status := dbStatusToSATStatus(rec.Status)
|
||||||
|
if status == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
detail := rec.ErrorSummary
|
||||||
|
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(key, "pcie:"):
|
||||||
|
bdf := strings.TrimPrefix(key, "pcie:")
|
||||||
|
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||||
|
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||||
|
if sanitizeBDFForLookup(bdf) == "" {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
normalized := sanitizeBDFForLookup(bdf)
|
||||||
|
for i := range snap.PCIeDevices {
|
||||||
|
if snap.PCIeDevices[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||||
|
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "storage:"):
|
||||||
|
devName := strings.TrimPrefix(key, "storage:")
|
||||||
|
if devName == "all" {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for i := range snap.Storage {
|
||||||
|
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||||
|
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||||
|
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "memory:"):
|
||||||
|
for i := range snap.Memory {
|
||||||
|
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
case strings.HasPrefix(key, "cpu:"):
|
||||||
|
for i := range snap.CPUs {
|
||||||
|
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// dbStatusToSATStatus validates a ComponentStatusDB status string for use
// with mergeComponentStatus. Surrounding whitespace is ignored; the result is
// the trimmed value when it is one of "OK", "Warning", "Critical" or
// "Unknown", and "" for anything else.
//
// The trimmed value is returned (not the raw input) so that a padded status
// such as " Warning " is not handed on with its whitespace intact.
func dbStatusToSATStatus(s string) string {
	switch status := strings.TrimSpace(s); status {
	case "OK", "Warning", "Critical", "Unknown":
		return status
	default:
		return ""
	}
}
|
||||||
|
|
||||||
|
// sanitizeBDFForLookup brings a PCIe BDF address into a lower-case form that
// can be compared for equality. An address with exactly one ":" gets the
// "0000:" domain prefix ("c8:00.0" becomes "0000:c8:00.0"); other values are
// only trimmed and lower-cased. It returns "" for empty input, for the
// literal "gpu", and for anything containing a space or tab.
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case addr == "", addr == "gpu", strings.ContainsAny(addr, " \t"):
		return ""
	case strings.Count(addr, ":") == 1:
		return "0000:" + addr
	}
	return addr
}
|
||||||
|
|
||||||
func ptrString(v *string) string {
|
func ptrString(v *string) string {
|
||||||
if v == nil {
|
if v == nil {
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
|||||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
applyLatestSATStatuses(&snap, baseDir)
|
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||||
|
|
||||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-network.service",
|
"bee-network.service",
|
||||||
"bee-nvidia.service",
|
"bee-nvidia.service",
|
||||||
"bee-preflight.service",
|
"bee-preflight.service",
|
||||||
|
"bee-selfheal.service",
|
||||||
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,6 +34,8 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||||
|
{name: "system/ip-link.txt", cmd: []string{"ip", "-details", "link", "show"}},
|
||||||
|
{name: "system/ip-link-stats.txt", cmd: []string{"ip", "-s", "link", "show"}},
|
||||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
@@ -47,6 +51,83 @@ for d in /sys/bus/pci/devices/*/; do
|
|||||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-info.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -i "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-link.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ethtool-module.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v ethtool >/dev/null 2>&1; then
|
||||||
|
echo "ethtool not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/class/net/*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
iface=$(basename "$path")
|
||||||
|
[ "$iface" = "lo" ] && continue
|
||||||
|
found=1
|
||||||
|
echo "=== $iface ==="
|
||||||
|
ethtool -m "$iface" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no interfaces found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/mstflint-query.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v mstflint >/dev/null 2>&1; then
|
||||||
|
echo "mstflint not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for path in /sys/bus/pci/devices/*; do
|
||||||
|
[ -e "$path/vendor" ] || continue
|
||||||
|
vendor=$(cat "$path/vendor" 2>/dev/null)
|
||||||
|
[ "$vendor" = "0x15b3" ] || continue
|
||||||
|
bdf=$(basename "$path")
|
||||||
|
found=1
|
||||||
|
echo "=== $bdf ==="
|
||||||
|
mstflint -d "$bdf" q 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no Mellanox/NVIDIA networking devices found"
|
||||||
|
fi
|
||||||
`}},
|
`}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,18 +2,21 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"context"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const mellanoxVendorID = 0x15b3
|
const mellanoxVendorID = 0x15b3
|
||||||
|
const nicProbeTimeout = 2 * time.Second
|
||||||
|
|
||||||
var (
|
var (
|
||||||
mstflintQuery = func(bdf string) (string, error) {
|
mstflintQuery = func(bdf string) (string, error) {
|
||||||
out, err := exec.Command("mstflint", "-d", bdf, "q").Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "mstflint", "-d", bdf, "q")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -21,7 +24,7 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
ethtoolInfoQuery = func(iface string) (string, error) {
|
ethtoolInfoQuery = func(iface string) (string, error) {
|
||||||
out, err := exec.Command("ethtool", "-i", iface).Output()
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-i", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -29,6 +32,14 @@ var (
|
|||||||
}
|
}
|
||||||
|
|
||||||
netIfacesByBDF = listNetIfacesByBDF
|
netIfacesByBDF = listNetIfacesByBDF
|
||||||
|
readNetCarrierFile = func(iface string) (string, error) {
|
||||||
|
path := filepath.Join("/sys/class/net", iface, "carrier")
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(string(raw)), nil
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
// enrichPCIeWithMellanox enriches Mellanox/NVIDIA Networking devices with
|
||||||
@@ -162,3 +173,17 @@ func listNetIfacesByBDF(bdf string) []string {
|
|||||||
}
|
}
|
||||||
return ifaces
|
return ifaces
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// commandOutputWithTimeout runs the named program with args under a context
// that expires after timeout, and returns the program's standard output
// together with any error from starting or running it.
func commandOutputWithTimeout(timeout time.Duration, name string, args ...string) ([]byte, error) {
	deadlineCtx, release := context.WithTimeout(context.Background(), timeout)
	defer release()

	cmd := exec.CommandContext(deadlineCtx, name, args...)
	return cmd.Output()
}
|
||||||
|
|
||||||
|
func interfaceHasCarrier(iface string) bool {
|
||||||
|
raw, err := readNetCarrierFile(iface)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(raw) == "1"
|
||||||
|
}
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ import (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
ethtoolModuleQuery = func(iface string) (string, error) {
|
ethtoolModuleQuery = func(iface string) (string, error) {
|
||||||
out, err := raidToolQuery("ethtool", "-m", iface)
|
out, err := commandOutputWithTimeout(nicProbeTimeout, "ethtool", "-m", iface)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -58,10 +58,12 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if interfaceHasCarrier(iface) {
|
||||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
enriched++
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
continue
|
enriched++
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
origReadMAC := readNetAddressFile
|
origReadMAC := readNetAddressFile
|
||||||
origEth := ethtoolInfoQuery
|
origEth := ethtoolInfoQuery
|
||||||
origModule := ethtoolModuleQuery
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
t.Cleanup(func() {
|
t.Cleanup(func() {
|
||||||
queryPCILSPCIDetail = origDetail
|
queryPCILSPCIDetail = origDetail
|
||||||
readPCIVPDFile = origVPD
|
readPCIVPDFile = origVPD
|
||||||
@@ -64,6 +65,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
readNetAddressFile = origReadMAC
|
readNetAddressFile = origReadMAC
|
||||||
ethtoolInfoQuery = origEth
|
ethtoolInfoQuery = origEth
|
||||||
ethtoolModuleQuery = origModule
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
})
|
})
|
||||||
|
|
||||||
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
queryPCILSPCIDetail = func(bdf string) (string, error) {
|
||||||
@@ -82,6 +84,7 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return "aa:bb:cc:dd:ee:ff", nil
|
return "aa:bb:cc:dd:ee:ff", nil
|
||||||
}
|
}
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "1", nil }
|
||||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("skip optics") }
|
||||||
|
|
||||||
@@ -101,6 +104,42 @@ func TestEnrichPCIeWithNICTelemetryAddsSerialFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T) {
|
||||||
|
origIfaces := netIfacesByBDF
|
||||||
|
origReadMAC := readNetAddressFile
|
||||||
|
origEth := ethtoolInfoQuery
|
||||||
|
origModule := ethtoolModuleQuery
|
||||||
|
origCarrier := readNetCarrierFile
|
||||||
|
t.Cleanup(func() {
|
||||||
|
netIfacesByBDF = origIfaces
|
||||||
|
readNetAddressFile = origReadMAC
|
||||||
|
ethtoolInfoQuery = origEth
|
||||||
|
ethtoolModuleQuery = origModule
|
||||||
|
readNetCarrierFile = origCarrier
|
||||||
|
})
|
||||||
|
|
||||||
|
netIfacesByBDF = func(string) []string { return []string{"eth0"} }
|
||||||
|
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||||
|
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||||
|
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||||
|
ethtoolModuleQuery = func(string) (string, error) {
|
||||||
|
t.Fatal("ethtool -m should not be called without carrier")
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
class := "EthernetController"
|
||||||
|
bdf := "0000:18:00.0"
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
DeviceClass: &class,
|
||||||
|
BDF: &bdf,
|
||||||
|
}}
|
||||||
|
|
||||||
|
out := enrichPCIeWithNICTelemetry(devs)
|
||||||
|
if len(out[0].MacAddresses) != 1 || out[0].MacAddresses[0] != "aa:bb:cc:dd:ee:ff" {
|
||||||
|
t.Fatalf("mac_addresses=%v", out[0].MacAddresses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDBMValue(t *testing.T) {
|
func TestDBMValue(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
in string
|
in string
|
||||||
|
|||||||
1493
audit/internal/platform/benchmark.go
Normal file
1493
audit/internal/platform/benchmark.go
Normal file
File diff suppressed because it is too large
Load Diff
252
audit/internal/platform/benchmark_report.go
Normal file
252
audit/internal/platform/benchmark_report.go
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||||
|
return renderBenchmarkReportWithCharts(result, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// benchmarkReportChart is one titled chart passed to
// renderBenchmarkReportWithCharts.
type benchmarkReportChart struct {
	// Title is the chart's heading.
	Title string
	// Content is the chart body.
	// NOTE(review): presumably pre-rendered text — confirm against the renderer.
	Content string
}
|
||||||
|
|
||||||
|
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||||
|
|
||||||
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||||
|
fmt.Fprintf(&b, "===========================\n\n")
|
||||||
|
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
||||||
|
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
||||||
|
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
||||||
|
|
||||||
|
if len(result.Findings) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Executive Summary\n")
|
||||||
|
fmt.Fprintf(&b, "-----------------\n")
|
||||||
|
for _, finding := range result.Findings {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", finding)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(result.Warnings) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Warnings\n")
|
||||||
|
fmt.Fprintf(&b, "--------\n")
|
||||||
|
for _, warning := range result.Warnings {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", warning)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
||||||
|
fmt.Fprintf(&b, "-----------------\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
||||||
|
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||||
|
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||||
|
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||||
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||||
|
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||||
|
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||||
|
if gpu.Scores.InterconnectScore > 0 {
|
||||||
|
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
||||||
|
}
|
||||||
|
if len(gpu.DegradationReasons) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
||||||
|
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
||||||
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Precision results:\n")
|
||||||
|
for _, precision := range gpu.PrecisionResults {
|
||||||
|
if precision.Supported {
|
||||||
|
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||||
|
if len(gpu.Notes) > 0 {
|
||||||
|
fmt.Fprintf(&b, " Notes:\n")
|
||||||
|
for _, note := range gpu.Notes {
|
||||||
|
fmt.Fprintf(&b, " - %s\n", note)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "Interconnect\n")
|
||||||
|
fmt.Fprintf(&b, "------------\n")
|
||||||
|
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
||||||
|
if result.Interconnect.Supported {
|
||||||
|
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
||||||
|
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
for _, note := range result.Interconnect.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(charts) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Terminal Charts\n")
|
||||||
|
fmt.Fprintf(&b, "---------------\n")
|
||||||
|
for _, chart := range charts {
|
||||||
|
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||||
|
if content == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s\n", chart.Title)
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if sp := result.ServerPower; sp != nil {
|
||||||
|
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||||
|
fmt.Fprintf(&b, "-------------------\n")
|
||||||
|
if !sp.Available {
|
||||||
|
fmt.Fprintf(&b, "Unavailable\n")
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||||
|
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||||
|
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, note := range sp.Notes {
|
||||||
|
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Methodology\n")
|
||||||
|
fmt.Fprintf(&b, "-----------\n")
|
||||||
|
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
||||||
|
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||||
|
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Raw Files\n")
|
||||||
|
fmt.Fprintf(&b, "---------\n")
|
||||||
|
fmt.Fprintf(&b, "- result.json\n")
|
||||||
|
fmt.Fprintf(&b, "- report.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- summary.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- verbose.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
||||||
|
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||||
|
phases := []struct {
|
||||||
|
name string
|
||||||
|
label string
|
||||||
|
}{
|
||||||
|
{name: "baseline", label: "Baseline"},
|
||||||
|
{name: "steady", label: "Steady State"},
|
||||||
|
{name: "cooldown", label: "Cooldown"},
|
||||||
|
}
|
||||||
|
var charts []benchmarkReportChart
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
for _, phase := range phases {
|
||||||
|
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, benchmarkReportChart{
|
||||||
|
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
||||||
|
Content: string(raw),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return charts
|
||||||
|
}
|
||||||
|
|
||||||
|
func stripANSIEscapeSequences(raw string) string {
|
||||||
|
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||||
|
type counter struct {
|
||||||
|
label string
|
||||||
|
us uint64
|
||||||
|
}
|
||||||
|
counters := []counter{
|
||||||
|
{"sw_power", t.SWPowerCapUS},
|
||||||
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||||
|
{"sync_boost", t.SyncBoostUS},
|
||||||
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||||
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, c := range counters {
|
||||||
|
if c.us == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sec := float64(c.us) / 1e6
|
||||||
|
if steadyDurationSec > 0 {
|
||||||
|
pct := sec / steadyDurationSec * 100
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||||
|
} else if sec < 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(parts) == 0 {
|
||||||
|
return "none"
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status)
|
||||||
|
var best float64
|
||||||
|
for i, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore)
|
||||||
|
if i == 0 || gpu.Scores.CompositeScore > best {
|
||||||
|
best = gpu.Scores.CompositeScore
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "best_composite_score=%.2f\n", best)
|
||||||
|
if result.Interconnect != nil {
|
||||||
|
fmt.Fprintf(&b, "interconnect_status=%s\n", result.Interconnect.Status)
|
||||||
|
fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
179
audit/internal/platform/benchmark_test.go
Normal file
179
audit/internal/platform/benchmark_test.go
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestResolveBenchmarkProfile(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
profile string
|
||||||
|
want benchmarkProfileSpec
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "default",
|
||||||
|
profile: "",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "stability",
|
||||||
|
profile: "stability",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "overnight",
|
||||||
|
profile: "overnight",
|
||||||
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
tc := tc
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
got := resolveBenchmarkProfile(tc.profile)
|
||||||
|
if got != tc.want {
|
||||||
|
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
|
||||||
|
Profile: "stability",
|
||||||
|
RunNCCL: false,
|
||||||
|
})
|
||||||
|
if opts.Profile != NvidiaBenchmarkProfileStability {
|
||||||
|
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
|
||||||
|
}
|
||||||
|
if opts.RunNCCL {
|
||||||
|
t.Fatalf("RunNCCL should stay false when explicitly disabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBenchmarkBurnLog(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
raw := strings.Join([]string{
|
||||||
|
"loader=bee-gpu-burn",
|
||||||
|
"[gpu 0] device=NVIDIA H100",
|
||||||
|
"[gpu 0] compute_capability=9.0",
|
||||||
|
"[gpu 0] backend=cublasLt",
|
||||||
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
|
"[gpu 0] status=OK",
|
||||||
|
}, "\n")
|
||||||
|
|
||||||
|
got := parseBenchmarkBurnLog(raw)
|
||||||
|
if got.Backend != "cublasLt" {
|
||||||
|
t.Fatalf("backend=%q want cublasLt", got.Backend)
|
||||||
|
}
|
||||||
|
if got.ComputeCapability != "9.0" {
|
||||||
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
|
}
|
||||||
|
if len(got.Profiles) != 2 {
|
||||||
|
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
||||||
|
}
|
||||||
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
|
}
|
||||||
|
if got.Profiles[1].Category != "fp8" {
|
||||||
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
result := NvidiaBenchmarkResult{
|
||||||
|
BenchmarkVersion: benchmarkVersion,
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "PARTIAL",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "partial",
|
||||||
|
},
|
||||||
|
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
|
||||||
|
GPUs: []BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100",
|
||||||
|
Status: "OK",
|
||||||
|
Steady: BenchmarkTelemetrySummary{
|
||||||
|
AvgPowerW: 680,
|
||||||
|
AvgTempC: 79,
|
||||||
|
AvgGraphicsClockMHz: 1725,
|
||||||
|
P95PowerW: 700,
|
||||||
|
P95TempC: 82,
|
||||||
|
P95GraphicsClockMHz: 1800,
|
||||||
|
},
|
||||||
|
Scores: BenchmarkScorecard{
|
||||||
|
ComputeScore: 1200,
|
||||||
|
PowerSustainScore: 96,
|
||||||
|
ThermalSustainScore: 88,
|
||||||
|
StabilityScore: 92,
|
||||||
|
CompositeScore: 1176,
|
||||||
|
},
|
||||||
|
PrecisionResults: []BenchmarkPrecisionResult{
|
||||||
|
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
|
||||||
|
},
|
||||||
|
Throttle: BenchmarkThrottleCounters{
|
||||||
|
SWPowerCapUS: 1000000,
|
||||||
|
},
|
||||||
|
DegradationReasons: []string{"power_capped"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
report := renderBenchmarkReport(result)
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Executive Summary",
|
||||||
|
"GPU 0 spent measurable time under SW power cap.",
|
||||||
|
"Composite score: 1176.00",
|
||||||
|
"fp16_tensor: 700.00 TOPS",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "OK",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "full",
|
||||||
|
},
|
||||||
|
}, []benchmarkReportChart{
|
||||||
|
{
|
||||||
|
Title: "GPU 0 Steady State",
|
||||||
|
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Terminal Charts",
|
||||||
|
"GPU 0 Steady State",
|
||||||
|
"GPU 0 chart",
|
||||||
|
"42┤───",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(report, "\x1b[31m") {
|
||||||
|
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||||
|
}
|
||||||
|
}
|
||||||
158
audit/internal/platform/benchmark_types.go
Normal file
158
audit/internal/platform/benchmark_types.go
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Names of the NVIDIA benchmark profiles.
const (
	// NvidiaBenchmarkProfileStandard is the "standard" profile name.
	NvidiaBenchmarkProfileStandard = "standard"
	// NvidiaBenchmarkProfileStability is the "stability" profile name.
	NvidiaBenchmarkProfileStability = "stability"
	// NvidiaBenchmarkProfileOvernight is the "overnight" profile name.
	NvidiaBenchmarkProfileOvernight = "overnight"
)
|
||||||
|
|
||||||
|
// NvidiaBenchmarkOptions holds the caller-supplied settings for an NVIDIA
// benchmark run.
type NvidiaBenchmarkOptions struct {
	// Profile is the benchmark profile name, e.g. one of the
	// NvidiaBenchmarkProfile* constants.
	Profile string
	// SizeMB is a size in megabytes, going by its name.
	// NOTE(review): what it sizes is decided by the runner, which is not visible here.
	SizeMB int
	// GPUIndices lists GPU indices for the run.
	// NOTE(review): presumably an explicit selection — confirm against the runner.
	GPUIndices []int
	// ExcludeGPUIndices lists GPU indices to leave out, going by its name.
	ExcludeGPUIndices []int
	// RunNCCL reports whether the NCCL phase was requested.
	RunNCCL bool
	// ParallelGPUs runs all selected GPUs simultaneously instead of sequentially.
	ParallelGPUs bool
}
|
||||||
|
|
||||||
|
|
||||||
|
type NvidiaBenchmarkResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchmarkNormalization struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
GPUs []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkNormalizationGPU holds the normalization details for one GPU,
// identified by Index: its persistence mode, the graphics and memory clock
// lock values in MHz with their status strings, and free-form notes. Every
// field except Index is omitted from the JSON when empty or zero.
type BenchmarkNormalizationGPU struct {
	Index                 int      `json:"index"`
	PersistenceMode       string   `json:"persistence_mode,omitempty"`
	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
type BenchmarkGPUResult struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
UUID string `json:"uuid,omitempty"`
|
||||||
|
Name string `json:"name,omitempty"`
|
||||||
|
BusID string `json:"bus_id,omitempty"`
|
||||||
|
VBIOS string `json:"vbios,omitempty"`
|
||||||
|
ComputeCapability string `json:"compute_capability,omitempty"`
|
||||||
|
Backend string `json:"backend,omitempty"`
|
||||||
|
Status string `json:"status"`
|
||||||
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
|
Baseline BenchmarkTelemetrySummary `json:"baseline"`
|
||||||
|
Steady BenchmarkTelemetrySummary `json:"steady"`
|
||||||
|
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
|
||||||
|
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
|
||||||
|
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
|
||||||
|
Scores BenchmarkScorecard `json:"scores"`
|
||||||
|
DegradationReasons []string `json:"degradation_reasons,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkTelemetrySummary aggregates GPU telemetry over one benchmark phase.
//
// Field-name suffixes give the unit: Sec (seconds), C (degrees Celsius),
// W (watts), MHz and Pct (percent). Avg/P95 prefixes mark the average and the
// 95th-percentile value.
// NOTE(review): the CV and Drift fields are presumably coefficient of
// variation and start-to-end drift — confirm against the code that fills them.
type BenchmarkTelemetrySummary struct {
	DurationSec         float64 `json:"duration_sec"`
	Samples             int     `json:"samples"`
	AvgTempC            float64 `json:"avg_temp_c"`
	P95TempC            float64 `json:"p95_temp_c"`
	AvgPowerW           float64 `json:"avg_power_w"`
	P95PowerW           float64 `json:"p95_power_w"`
	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
	AvgUsagePct         float64 `json:"avg_usage_pct"`
	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
	ClockCVPct          float64 `json:"clock_cv_pct"`
	PowerCVPct          float64 `json:"power_cv_pct"`
	TempCVPct           float64 `json:"temp_cv_pct"`
	ClockDriftPct       float64 `json:"clock_drift_pct"`
}
|
||||||
|
|
||||||
|
// BenchmarkThrottleCounters holds, per throttle reason, the time a GPU spent
// throttled, in microseconds (the US suffix). The reasons are software power
// cap, software thermal slowdown, sync boost, hardware thermal slowdown and
// hardware power-brake slowdown.
type BenchmarkThrottleCounters struct {
	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
	SyncBoostUS            uint64 `json:"sync_boost_us"`
	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}
|
||||||
|
|
||||||
|
// BenchmarkPrecisionResult is the outcome for one numeric precision on one
// GPU: its name and category, whether it is supported, and, when measured,
// the lane count, the M/N/K dimensions, the iteration count and the
// throughput in tera-operations per second. Notes carries free-form text.
// Fields other than Name, Category and Supported are omitted from the JSON
// when zero or empty.
type BenchmarkPrecisionResult struct {
	Name          string  `json:"name"`
	Category      string  `json:"category"`
	Supported     bool    `json:"supported"`
	Lanes         int     `json:"lanes,omitempty"`
	M             uint64  `json:"m,omitempty"`
	N             uint64  `json:"n,omitempty"`
	K             uint64  `json:"k,omitempty"`
	Iterations    uint64  `json:"iterations,omitempty"`
	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
	Notes         string  `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkScorecard collects the derived scores for one GPU: the individual
// compute, power-sustain, thermal-sustain, stability and interconnect scores
// plus the composite score.
type BenchmarkScorecard struct {
	ComputeScore        float64 `json:"compute_score"`
	PowerSustainScore   float64 `json:"power_sustain_score"`
	ThermalSustainScore float64 `json:"thermal_sustain_score"`
	StabilityScore      float64 `json:"stability_score"`
	InterconnectScore   float64 `json:"interconnect_score"`
	CompositeScore      float64 `json:"composite_score"`
	// TOPSPerSMPerGHz is the compute efficiency normalised by SM count and
	// clock speed, which makes it comparable across throttle levels and GPU
	// generations. A low value at normal clocks indicates silicon degradation.
	TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower records whole-server power measured via IPMI next to
// the power the GPUs report themselves.
//
// ReportingRatio is delta / gpu_reported_sum. A value near 1.0 means the GPU
// power telemetry is accurate; a value well below 1.0 (for example 0.5) means
// the GPU is over-reporting its power consumption.
type BenchmarkServerPower struct {
	// Available is false when no server power reading could be obtained.
	Available       bool     `json:"available"`
	IdleW           float64  `json:"idle_w,omitempty"`
	LoadedW         float64  `json:"loaded_w,omitempty"`
	DeltaW          float64  `json:"delta_w,omitempty"`
	GPUReportedSumW float64  `json:"gpu_reported_sum_w,omitempty"`
	ReportingRatio  float64  `json:"reporting_ratio,omitempty"`
	Notes           []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkInterconnectResult is the outcome of the interconnect phase of a
// benchmark run: its status, whether it was attempted and supported, the GPUs
// involved, and the average and maximum algorithm and bus bandwidth in GB/s.
// Notes carries free-form text.
type BenchmarkInterconnectResult struct {
	Status             string   `json:"status"`
	Attempted          bool     `json:"attempted"`
	Supported          bool     `json:"supported"`
	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
	Notes              []string `json:"notes,omitempty"`
}
|
||||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import "regexp"
|
||||||
|
|
||||||
|
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
type ErrorPattern struct {
	// Name is a short machine-readable label for logging and deduplication.
	Name string
	// Re is the compiled regular expression matched against a single kmsg line.
	Re *regexp.Regexp
	// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
	Category string
	// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
	Severity string
	// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
	// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
	BDFGroup int
	// DevGroup is the capture group index (1-based) that contains a device name
	// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
	DevGroup int
}

// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
// To add a new pattern: append a new ErrorPattern struct to this slice.
//
// All patterns are case-insensitive. Patterns for a "corrected"/"correctable"
// condition are anchored with \b so they do not also fire on "uncorrected"/
// "uncorrectable" lines, which contain the shorter word as a suffix.
var HardwareErrorPatterns = []ErrorPattern{
	// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
	{
		Name:     "nvidia-rminitadapter",
		Re:       mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "nvidia-msi-fail",
		Re:       mustPat(`(?i)NVRM:.*Failed to enable MSI`),
		Category: "gpu",
		Severity: "warning",
	},
	{
		Name:     "nvidia-aer",
		Re:       mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		// NOTE(review): this requires a full domain:bus:dev.fn address after
		// "Xid"; verify against real driver output, which may print the PCI
		// address without the ".fn" suffix.
		Name:     "nvidia-xid",
		Re:       mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
		Category: "gpu",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── PCIe AER (generic) ──────────────────────────────────────────────────────
	{
		Name:     "pcie-aer",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-uncorrectable",
		Re:       mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},
	{
		Name:     "pcie-link-down",
		Re:       mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
		Category: "pcie",
		Severity: "warning",
		BDFGroup: 1,
	},

	// ── Storage ─────────────────────────────────────────────────────────────────
	{
		Name:     "blk-io-error",
		Re:       mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "nvme-timeout",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*timeout`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},
	{
		Name:     "scsi-failed",
		Re:       mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
		Category: "storage",
		Severity: "warning",
	},
	{
		Name:     "nvme-reset",
		Re:       mustPat(`(?i)nvme\s+(\w+):.*reset`),
		Category: "storage",
		Severity: "warning",
		DevGroup: 1,
	},

	// ── Machine Check Exceptions ────────────────────────────────────────────────
	{
		Name:     "mce-hardware-error",
		Re:       mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
		Category: "mce",
		Severity: "warning",
	},
	{
		// \b keeps "Uncorrected ..." lines from being labelled as corrected.
		Name:     "mce-corrected",
		Re:       mustPat(`(?i)mce:.*\bcorrected`),
		Category: "mce",
		Severity: "warning",
	},

	// ── Memory ─────────────────────────────────────────────────────────────────
	{
		Name:     "edac-ue",
		Re:       mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
		Category: "memory",
		Severity: "warning",
	},
	{
		// \b keeps "uncorrectable" lines (already covered by edac-ue) from
		// also matching here.
		Name:     "edac-ce",
		Re:       mustPat(`(?i)EDAC.*\bcorrectable`),
		Category: "memory",
		Severity: "warning",
	},
}

// mustPat compiles s and panics if it is not a valid regular expression. It is
// meant for the package-level pattern table above, where a bad pattern is a
// programming error.
func mustPat(s string) *regexp.Regexp {
	return regexp.MustCompile(s)
}
|
||||||
@@ -20,12 +20,13 @@ type GPUMetricRow struct {
|
|||||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||||
PowerW float64 `json:"power_w"`
|
PowerW float64 `json:"power_w"`
|
||||||
ClockMHz float64 `json:"clock_mhz"`
|
ClockMHz float64 `json:"clock_mhz"`
|
||||||
|
MemClockMHz float64 `json:"mem_clock_mhz"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -46,7 +47,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
parts := strings.Split(line, ", ")
|
parts := strings.Split(line, ", ")
|
||||||
if len(parts) < 6 {
|
if len(parts) < 7 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||||
@@ -57,6 +58,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
MemUsagePct: parseGPUFloat(parts[3]),
|
MemUsagePct: parseGPUFloat(parts[3]),
|
||||||
PowerW: parseGPUFloat(parts[4]),
|
PowerW: parseGPUFloat(parts[4]),
|
||||||
ClockMHz: parseGPUFloat(parts[5]),
|
ClockMHz: parseGPUFloat(parts[5]),
|
||||||
|
MemClockMHz: parseGPUFloat(parts[6]),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return rows, nil
|
return rows, nil
|
||||||
@@ -139,10 +141,10 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
|||||||
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
// WriteGPUMetricsCSV writes collected rows as a CSV file.
|
||||||
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz\n")
|
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
|
||||||
for _, r := range rows {
|
for _, r := range rows {
|
||||||
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f\n",
|
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
|
||||||
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz)
|
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
|
||||||
}
|
}
|
||||||
return os.WriteFile(path, b.Bytes(), 0644)
|
return os.WriteFile(path, b.Bytes(), 0644)
|
||||||
}
|
}
|
||||||
@@ -197,7 +199,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
const PW = plotX2 - plotX1
|
const PW = plotX2 - plotX1
|
||||||
const PH = plotY2 - plotY1
|
const PH = plotY2 - plotY1
|
||||||
// Outer axes
|
// Outer axes
|
||||||
const tempAxisX = 60 // temp axis line
|
const tempAxisX = 60 // temp axis line
|
||||||
const clockAxisX = 900 // clock axis line
|
const clockAxisX = 900 // clock axis line
|
||||||
|
|
||||||
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"}
|
||||||
|
|||||||
@@ -120,10 +120,45 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) erro
|
|||||||
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
log(fmt.Sprintf("Warning: rebind /run/live/medium failed: %v", err))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log("Verifying live medium now served from RAM...")
|
||||||
|
status := s.LiveBootSource()
|
||||||
|
if err := verifyInstallToRAMStatus(status); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
log("Done. Installation media can be safely disconnected.")
|
log("Done. Installation media can be safely disconnected.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func verifyInstallToRAMStatus(status LiveBootSource) error {
|
||||||
|
if status.InRAM {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("install to RAM verification failed: live medium still mounted from %s", describeLiveBootSource(status))
|
||||||
|
}
|
||||||
|
|
||||||
|
func describeLiveBootSource(status LiveBootSource) string {
|
||||||
|
source := strings.TrimSpace(status.Device)
|
||||||
|
if source == "" {
|
||||||
|
source = strings.TrimSpace(status.Source)
|
||||||
|
}
|
||||||
|
if source == "" {
|
||||||
|
source = "unknown source"
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(status.Kind) {
|
||||||
|
case "ram":
|
||||||
|
return "RAM"
|
||||||
|
case "usb":
|
||||||
|
return "USB (" + source + ")"
|
||||||
|
case "cdrom":
|
||||||
|
return "CD-ROM (" + source + ")"
|
||||||
|
case "disk":
|
||||||
|
return "disk (" + source + ")"
|
||||||
|
default:
|
||||||
|
return source
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
func copyFileLarge(ctx context.Context, src, dst string, logFunc func(string)) error {
|
||||||
in, err := os.Open(src)
|
in, err := os.Open(src)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package platform
|
|||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
func TestInferLiveBootKind(t *testing.T) {
|
func TestInferLiveBootKind(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
fsType string
|
fsType string
|
||||||
@@ -18,6 +20,7 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
{name: "unknown", source: "overlay", want: "unknown"},
|
{name: "unknown", source: "overlay", want: "unknown"},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
|
tc := tc
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||||
if got != tc.want {
|
if got != tc.want {
|
||||||
@@ -26,3 +29,29 @@ func TestInferLiveBootKind(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestVerifyInstallToRAMStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if err := verifyInstallToRAMStatus(LiveBootSource{InRAM: true, Kind: "ram", Source: "tmpfs"}); err != nil {
|
||||||
|
t.Fatalf("expected success for RAM-backed status, got %v", err)
|
||||||
|
}
|
||||||
|
err := verifyInstallToRAMStatus(LiveBootSource{InRAM: false, Kind: "usb", Device: "/dev/sdb1"})
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected verification failure when media is still on USB")
|
||||||
|
}
|
||||||
|
if got := err.Error(); got != "install to RAM verification failed: live medium still mounted from USB (/dev/sdb1)" {
|
||||||
|
t.Fatalf("error=%q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDescribeLiveBootSource(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{InRAM: true, Kind: "ram"}); got != "RAM" {
|
||||||
|
t.Fatalf("got %q want RAM", got)
|
||||||
|
}
|
||||||
|
if got := describeLiveBootSource(LiveBootSource{Kind: "unknown", Source: "/run/live/medium"}); got != "/run/live/medium" {
|
||||||
|
t.Fatalf("got %q want /run/live/medium", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
|||||||
"stress-ng",
|
"stress-ng",
|
||||||
"stressapptest",
|
"stressapptest",
|
||||||
"memtester",
|
"memtester",
|
||||||
|
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||||
|
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||||
|
"nvvs",
|
||||||
|
"dcgmi",
|
||||||
}
|
}
|
||||||
|
|
||||||
// KilledProcess describes a process that was sent SIGKILL.
|
// KilledProcess describes a process that was sent SIGKILL.
|
||||||
|
|||||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaStressArchivePrefix(loader string) string {
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
|
|||||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||||
if gpuCmd == nil {
|
if gpuCmd == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||||
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
|||||||
|
|
||||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||||
switch strings.ToLower(vendor) {
|
switch strings.ToLower(vendor) {
|
||||||
case "amd":
|
case "amd":
|
||||||
return buildAMDGPUStressCmd(ctx)
|
return buildAMDGPUStressCmd(ctx, durSec)
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return buildNvidiaGPUStressCmd(ctx)
|
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
rvsArgs, err := resolveRVSCommand()
|
rvsArgs, err := resolveRVSCommand()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
rvsPath := rvsArgs[0]
|
rvsPath := rvsArgs[0]
|
||||||
cfg := `actions:
|
cfg := fmt.Sprintf(`actions:
|
||||||
- name: gst_platform
|
- name: gst_platform
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
parallel: true
|
parallel: true
|
||||||
duration: 86400000
|
duration: %d`, durSec*1000) + `
|
||||||
copy_matrix: false
|
copy_matrix: false
|
||||||
target_stress: 90
|
target_stress: 90
|
||||||
matrix_size_a: 8640
|
matrix_size_a: 8640
|
||||||
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
return cmd
|
return cmd
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-burn")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
path, err = satLookPath("bee-gpu-stress")
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||||
|
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||||
|
// where the context is cancelled early (user stop, parent timeout).
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
cmd.Cancel = func() error {
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
cmd.Stdout = nil
|
cmd.Stdout = nil
|
||||||
cmd.Stderr = nil
|
cmd.Stderr = nil
|
||||||
_ = startLowPriorityCmd(cmd, 10)
|
_ = startLowPriorityCmd(cmd, 10)
|
||||||
|
|||||||
@@ -135,12 +135,15 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
case "nvidia":
|
case "nvidia":
|
||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
|
"dcgmi",
|
||||||
|
"nv-hostengine",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"bee-nccl-gpu-stress",
|
"bee-nccl-gpu-stress",
|
||||||
"all_reduce_perf",
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
|
tools = append(tools, resolvedToolStatus("dcgmproftester", dcgmProfTesterCandidates...))
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
if cmd, err := resolveROCmSMICommand(); err == nil && len(cmd) > 0 {
|
||||||
@@ -155,11 +158,37 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
return tools
|
return tools
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
path, err := exec.LookPath(candidate)
|
||||||
|
if err == nil {
|
||||||
|
return ToolStatus{Name: display, Path: path, OK: true}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ToolStatus{Name: display}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||||
lsmodText := commandText("lsmod")
|
lsmodText := commandText("lsmod")
|
||||||
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
|
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||||
|
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||||
|
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_stuck",
|
||||||
|
Severity: "critical",
|
||||||
|
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||||
|
})
|
||||||
|
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||||
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
Code: "nvidia_gsp_disabled",
|
||||||
|
Severity: "warning",
|
||||||
|
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||||
if !health.DriverReady {
|
if !health.DriverReady {
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
|
|||||||
@@ -12,19 +12,20 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
satExecCommand = exec.Command
|
satExecCommand = exec.Command
|
||||||
satLookPath = exec.LookPath
|
satLookPath = exec.LookPath
|
||||||
satGlob = filepath.Glob
|
satGlob = filepath.Glob
|
||||||
satStat = os.Stat
|
satStat = os.Stat
|
||||||
|
satFreeMemBytes = freeMemBytes
|
||||||
|
|
||||||
rocmSMIExecutableGlobs = []string{
|
rocmSMIExecutableGlobs = []string{
|
||||||
"/opt/rocm/bin/rocm-smi",
|
"/opt/rocm/bin/rocm-smi",
|
||||||
@@ -38,6 +39,12 @@ var (
|
|||||||
"/opt/rocm/bin/rvs",
|
"/opt/rocm/bin/rvs",
|
||||||
"/opt/rocm-*/bin/rvs",
|
"/opt/rocm-*/bin/rvs",
|
||||||
}
|
}
|
||||||
|
dcgmProfTesterCandidates = []string{
|
||||||
|
"dcgmproftester",
|
||||||
|
"dcgmproftester13",
|
||||||
|
"dcgmproftester12",
|
||||||
|
"dcgmproftester11",
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
// streamExecOutput runs cmd and streams each output line to logFunc (if non-nil).
|
||||||
@@ -76,15 +83,15 @@ func streamExecOutput(cmd *exec.Cmd, logFunc func(string)) ([]byte, error) {
|
|||||||
|
|
||||||
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
// NvidiaGPU holds basic GPU info from nvidia-smi.
|
||||||
type NvidiaGPU struct {
|
type NvidiaGPU struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
MemoryMB int
|
MemoryMB int `json:"memory_mb"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
|
||||||
type AMDGPUInfo struct {
|
type AMDGPUInfo struct {
|
||||||
Index int
|
Index int `json:"index"`
|
||||||
Name string
|
Name string `json:"name"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
// DetectGPUVendor returns "nvidia" if /dev/nvidia0 exists, "amd" if /dev/kfd exists, or "" otherwise.
|
||||||
@@ -256,6 +263,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
|||||||
MemoryMB: memMB,
|
MemoryMB: memMB,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
|
return gpus[i].Index < gpus[j].Index
|
||||||
|
})
|
||||||
return gpus, nil
|
return gpus, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,13 +278,87 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
|
satJob{
|
||||||
|
name: "03-dcgmproftester.log",
|
||||||
|
cmd: profCmd,
|
||||||
|
env: nvidiaVisibleDevicesEnv(selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-power.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-pulse-test.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -286,7 +370,68 @@ func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (
|
|||||||
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
// gpuIndices: specific GPU indices to test (empty = all GPUs).
|
||||||
// ctx cancellation kills the running job.
|
// ctx cancellation kills the running job.
|
||||||
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, gpuIndices), logFunc)
|
resolvedGPUIndices, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia", nvidiaDCGMJobs(diagLevel, resolvedGPUIndices), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||||
|
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||||
|
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||||
|
for _, p := range killed {
|
||||||
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||||
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
satJob{
|
||||||
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
},
|
||||||
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
return dedupeSortedIndices(gpuIndices), nil
|
||||||
|
}
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
return all, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func memoryStressSizeArg() string {
|
||||||
|
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||||
|
return fmt.Sprintf("%dM", mb)
|
||||||
|
}
|
||||||
|
availBytes := satFreeMemBytes()
|
||||||
|
if availBytes <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
availMB := availBytes / (1024 * 1024)
|
||||||
|
targetMB := (availMB * 2) / 3
|
||||||
|
if targetMB >= 256 {
|
||||||
|
targetMB = (targetMB / 256) * 256
|
||||||
|
}
|
||||||
|
if targetMB <= 0 {
|
||||||
|
return "80%"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%dM", targetMB)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -304,11 +449,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
|||||||
if seconds <= 0 {
|
if seconds <= 0 {
|
||||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||||
}
|
}
|
||||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||||
sizeArg := "80%"
|
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
sizeArg := memoryStressSizeArg()
|
||||||
sizeArg = fmt.Sprintf("%dM", mb)
|
|
||||||
}
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||||
@@ -425,14 +568,24 @@ type satStats struct {
|
|||||||
Unsupported int
|
Unsupported int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||||
|
out := make([]satJob, 0, len(jobs)+1)
|
||||||
|
out = append(out, satJob{
|
||||||
|
name: "00-nvidia-smi-persistence-mode.log",
|
||||||
|
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||||
|
})
|
||||||
|
out = append(out, jobs...)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
@@ -447,11 +600,39 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
|
args := []string{"dcgmi", "diag", "-r", name}
|
||||||
|
if durationSec > 0 {
|
||||||
|
args = append(args, "-p", fmt.Sprintf("%s.test_duration=%d", name, durationSec))
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append(args, "-i", joinIndexList(gpuIndices))
|
||||||
|
}
|
||||||
|
return args
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaBurnDuration(durationSec int) int {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 300
|
||||||
|
}
|
||||||
|
return durationSec
|
||||||
|
}
|
||||||
|
|
||||||
|
func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||||
|
if len(gpuIndices) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -493,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
@@ -624,6 +808,7 @@ func classifySATResult(name string, out []byte, err error) (string, int) {
|
|||||||
}
|
}
|
||||||
if strings.Contains(text, "unsupported") ||
|
if strings.Contains(text, "unsupported") ||
|
||||||
strings.Contains(text, "not supported") ||
|
strings.Contains(text, "not supported") ||
|
||||||
|
strings.Contains(text, "not found in path") ||
|
||||||
strings.Contains(text, "invalid opcode") ||
|
strings.Contains(text, "invalid opcode") ||
|
||||||
strings.Contains(text, "unknown command") ||
|
strings.Contains(text, "unknown command") ||
|
||||||
strings.Contains(text, "not implemented") ||
|
strings.Contains(text, "not implemented") ||
|
||||||
@@ -730,6 +915,15 @@ func resolveROCmSMICommand(args ...string) ([]string, error) {
|
|||||||
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
return nil, errors.New("rocm-smi not found in PATH or under /opt/rocm")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolveDCGMProfTesterCommand(args ...string) ([]string, error) {
|
||||||
|
for _, candidate := range dcgmProfTesterCandidates {
|
||||||
|
if path, err := satLookPath(candidate); err == nil {
|
||||||
|
return append([]string{path}, args...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, errors.New("dcgmproftester not found in PATH")
|
||||||
|
}
|
||||||
|
|
||||||
func ensureAMDRuntimeReady() error {
|
func ensureAMDRuntimeReady() error {
|
||||||
if _, err := os.Stat("/dev/kfd"); err == nil {
|
if _, err := os.Stat("/dev/kfd"); err == nil {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
|
|
||||||
if len(jobs) != 5 {
|
if len(jobs) != 6 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||||
|
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[5].cmd
|
||||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||||
|
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||||
|
if len(jobs) != 5 {
|
||||||
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||||
|
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -162,6 +183,89 @@ func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesUsesDetectedGPUsWhenUnset(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '2\n0\n1\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices(nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "0,1,2"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMGPUIndicesKeepsExplicitSelection(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got, err := resolveDCGMGPUIndices([]int{3, 1, 3})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMGPUIndices error: %v", err)
|
||||||
|
}
|
||||||
|
if want := "1,3"; joinIndexList(got) != want {
|
||||||
|
t.Fatalf("gpuIndices=%q want %q", joinIndexList(got), want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolveDCGMProfTesterCommandUsesVersionedBinary(t *testing.T) {
|
||||||
|
oldLookPath := satLookPath
|
||||||
|
satLookPath = func(file string) (string, error) {
|
||||||
|
switch file {
|
||||||
|
case "dcgmproftester13":
|
||||||
|
return "/usr/bin/dcgmproftester13", nil
|
||||||
|
default:
|
||||||
|
return "", exec.ErrNotFound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||||
|
|
||||||
|
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("resolveDCGMProfTesterCommand error: %v", err)
|
||||||
|
}
|
||||||
|
if len(cmd) != 4 {
|
||||||
|
t.Fatalf("cmd len=%d want 4 (%v)", len(cmd), cmd)
|
||||||
|
}
|
||||||
|
if cmd[0] != "/usr/bin/dcgmproftester13" {
|
||||||
|
t.Fatalf("cmd[0]=%q want /usr/bin/dcgmproftester13", cmd[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||||
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", 900, []int{3, 1})
|
||||||
|
want := []string{"dcgmi", "diag", "-r", "targeted_power", "-p", "targeted_power.test_duration=900", "-i", "3,1"}
|
||||||
|
if len(cmd) != len(want) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(cmd), len(want), cmd)
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if cmd[i] != want[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, cmd[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
|
if len(env) != 2 {
|
||||||
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
|
}
|
||||||
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -196,6 +300,37 @@ func TestEnvIntFallback(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "65536M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "4096M" {
|
||||||
|
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||||
|
oldFreeMemBytes := satFreeMemBytes
|
||||||
|
satFreeMemBytes = func() int64 { return 0 }
|
||||||
|
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||||
|
|
||||||
|
if got := memoryStressSizeArg(); got != "80%" {
|
||||||
|
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestClassifySATResult(t *testing.T) {
|
func TestClassifySATResult(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
@@ -220,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -10,17 +10,30 @@ import (
|
|||||||
func (s *System) ListBeeServices() ([]string, error) {
|
func (s *System) ListBeeServices() ([]string, error) {
|
||||||
seen := map[string]bool{}
|
seen := map[string]bool{}
|
||||||
var out []string
|
var out []string
|
||||||
for _, pattern := range []string{"/etc/systemd/system/bee-*.service", "/lib/systemd/system/bee-*.service"} {
|
for _, pattern := range []string{
|
||||||
|
"/etc/systemd/system/bee-*.service",
|
||||||
|
"/lib/systemd/system/bee-*.service",
|
||||||
|
"/etc/systemd/system/bee-*.timer",
|
||||||
|
"/lib/systemd/system/bee-*.timer",
|
||||||
|
} {
|
||||||
matches, err := filepath.Glob(pattern)
|
matches, err := filepath.Glob(pattern)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
name := strings.TrimSuffix(filepath.Base(match), ".service")
|
base := filepath.Base(match)
|
||||||
|
name := base
|
||||||
|
if strings.HasSuffix(base, ".service") {
|
||||||
|
name = strings.TrimSuffix(base, ".service")
|
||||||
|
}
|
||||||
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
// Skip template units (e.g. bee-journal-mirror@) — they have no instances to query.
|
||||||
if strings.HasSuffix(name, "@") {
|
if strings.HasSuffix(name, "@") {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// bee-selfheal is timer-managed; showing the oneshot service as inactive is misleading.
|
||||||
|
if name == "bee-selfheal" && strings.HasSuffix(base, ".service") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if !seen[name] {
|
if !seen[name] {
|
||||||
seen[name] = true
|
seen[name] = true
|
||||||
out = append(out, name)
|
out = append(out, name)
|
||||||
@@ -48,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
// bee-web runs as the bee user; sudo is required to control system services.
|
||||||
|
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||||
|
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||||
return string(raw), err
|
return string(raw), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -44,12 +44,12 @@ type StaticIPv4Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type RemovableTarget struct {
|
type RemovableTarget struct {
|
||||||
Device string
|
Device string `json:"device"`
|
||||||
FSType string
|
FSType string `json:"fs_type"`
|
||||||
Size string
|
Size string `json:"size"`
|
||||||
Label string
|
Label string `json:"label"`
|
||||||
Model string
|
Model string `json:"model"`
|
||||||
Mountpoint string
|
Mountpoint string `json:"mountpoint"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ToolStatus struct {
|
type ToolStatus struct {
|
||||||
|
|||||||
31
audit/internal/platform/types_test.go
Normal file
31
audit/internal/platform/types_test.go
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRemovableTargetJSONUsesFrontendFieldNames(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
data, err := json.Marshal(RemovableTarget{
|
||||||
|
Device: "/dev/sdb1",
|
||||||
|
FSType: "exfat",
|
||||||
|
Size: "1.8T",
|
||||||
|
Label: "USB",
|
||||||
|
Model: "Flash",
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("marshal: %v", err)
|
||||||
|
}
|
||||||
|
raw := string(data)
|
||||||
|
for _, key := range []string{`"device"`, `"fs_type"`, `"size"`, `"label"`, `"model"`} {
|
||||||
|
if !strings.Contains(raw, key) {
|
||||||
|
t.Fatalf("json missing key %s: %s", key, raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(raw, `"Device"`) || strings.Contains(raw, `"FSType"`) {
|
||||||
|
t.Fatalf("json still contains Go field names: %s", raw)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
|
|||||||
ExportDir string `json:"export_dir,omitempty"`
|
ExportDir string `json:"export_dir,omitempty"`
|
||||||
DriverReady bool `json:"driver_ready,omitempty"`
|
DriverReady bool `json:"driver_ready,omitempty"`
|
||||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||||
|
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||||
NetworkStatus string `json:"network_status,omitempty"`
|
NetworkStatus string `json:"network_status,omitempty"`
|
||||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -21,13 +22,238 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||||
|
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
|
|
||||||
func newJobID(prefix string) string {
|
func newJobID(_ string) string {
|
||||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
start := int((jobCounter.Add(1) - 1) % 1000)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
for offset := 0; offset < 1000; offset++ {
|
||||||
|
n := (start + offset) % 1000
|
||||||
|
id := fmt.Sprintf("TASK-%03d", n)
|
||||||
|
if !taskIDInUseLocked(id) {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("TASK-%03d", start)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskIDInUseLocked(id string) bool {
|
||||||
|
for _, t := range globalQueue.tasks {
|
||||||
|
if t != nil && t.ID == id {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
type taskRunResponse struct {
|
||||||
|
TaskID string `json:"task_id,omitempty"`
|
||||||
|
JobID string `json:"job_id,omitempty"`
|
||||||
|
TaskIDs []string `json:"task_ids,omitempty"`
|
||||||
|
JobIDs []string `json:"job_ids,omitempty"`
|
||||||
|
TaskCount int `json:"task_count,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaTaskSelection struct {
|
||||||
|
GPUIndices []int
|
||||||
|
Label string
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||||
|
if len(tasks) == 0 {
|
||||||
|
writeJSON(w, taskRunResponse{})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids := make([]string, 0, len(tasks))
|
||||||
|
for _, t := range tasks {
|
||||||
|
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, t.ID)
|
||||||
|
}
|
||||||
|
resp := taskRunResponse{TaskCount: len(ids)}
|
||||||
|
if len(ids) > 0 {
|
||||||
|
resp.TaskID = ids[0]
|
||||||
|
resp.JobID = ids[0]
|
||||||
|
resp.TaskIDs = ids
|
||||||
|
resp.JobIDs = ids
|
||||||
|
}
|
||||||
|
writeJSON(w, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||||
|
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||||
|
"nvidia-bandwidth", "nvidia-stress":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
|
}
|
||||||
|
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = gpu
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
|
||||||
|
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||||
|
modelOrder := make([]string, 0)
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpu := indexed[idx]
|
||||||
|
model := strings.TrimSpace(gpu.Name)
|
||||||
|
if model == "" {
|
||||||
|
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||||
|
}
|
||||||
|
if _, ok := modelGroups[model]; !ok {
|
||||||
|
modelOrder = append(modelOrder, model)
|
||||||
|
}
|
||||||
|
modelGroups[model] = append(modelGroups[model], gpu)
|
||||||
|
}
|
||||||
|
sort.Slice(modelOrder, func(i, j int) bool {
|
||||||
|
left := modelGroups[modelOrder[i]]
|
||||||
|
right := modelGroups[modelOrder[j]]
|
||||||
|
if len(left) == 0 || len(right) == 0 {
|
||||||
|
return modelOrder[i] < modelOrder[j]
|
||||||
|
}
|
||||||
|
return left[0].Index < right[0].Index
|
||||||
|
})
|
||||||
|
|
||||||
|
var groups []nvidiaTaskSelection
|
||||||
|
var singles []nvidiaTaskSelection
|
||||||
|
for _, model := range modelOrder {
|
||||||
|
group := modelGroups[model]
|
||||||
|
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||||
|
indices := make([]int, 0, len(group))
|
||||||
|
for _, gpu := range group {
|
||||||
|
indices = append(indices, gpu.Index)
|
||||||
|
}
|
||||||
|
if len(indices) >= 2 {
|
||||||
|
groups = append(groups, nvidiaTaskSelection{
|
||||||
|
GPUIndices: indices,
|
||||||
|
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpu := group[0]
|
||||||
|
singles = append(singles, nvidiaTaskSelection{
|
||||||
|
GPUIndices: []int{gpu.Index},
|
||||||
|
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return append(groups, singles...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func joinTaskIndices(indices []int) string {
|
||||||
|
parts := make([]string, 0, len(indices))
|
||||||
|
for _, idx := range indices {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d", idx))
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ",")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||||
|
baseName = strings.TrimSpace(baseName)
|
||||||
|
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||||
|
if baseName == "" {
|
||||||
|
return selectionLabel
|
||||||
|
}
|
||||||
|
if selectionLabel == "" {
|
||||||
|
return baseName
|
||||||
|
}
|
||||||
|
return baseName + " (" + selectionLabel + ")"
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||||
|
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: baseName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: params,
|
||||||
|
}
|
||||||
|
return []*Task{t}, nil
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tasks := make([]*Task, 0, len(selections))
|
||||||
|
for _, selection := range selections {
|
||||||
|
taskParamsCopy := params
|
||||||
|
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||||
|
taskParamsCopy.ExcludeGPUIndices = nil
|
||||||
|
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||||
|
taskParamsCopy.DisplayName = displayName
|
||||||
|
tasks = append(tasks, &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: displayName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: taskParamsCopy,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return tasks, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
@@ -110,6 +336,11 @@ func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
|
|||||||
|
|
||||||
scanDone := make(chan error, 1)
|
scanDone := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
scanDone <- fmt.Errorf("stream scanner panic: %v", rec)
|
||||||
|
}
|
||||||
|
}()
|
||||||
scanner := bufio.NewScanner(pr)
|
scanner := bufio.NewScanner(pr)
|
||||||
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
@@ -202,31 +433,84 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||||
t := &Task{
|
|
||||||
ID: newJobID("sat-" + target),
|
|
||||||
Name: name,
|
|
||||||
Target: target,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
Duration: body.Duration,
|
|
||||||
DiagLevel: body.DiagLevel,
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
Loader: body.Loader,
|
|
||||||
BurnProfile: body.Profile,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
PlatformComponents: body.PlatformComponents,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
params := taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
Duration: body.Duration,
|
||||||
|
DiagLevel: body.DiagLevel,
|
||||||
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
Loader: body.Loader,
|
||||||
|
BurnProfile: body.Profile,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
PlatformComponents: body.PlatformComponents,
|
||||||
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var body struct {
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
SizeMB int `json:"size_mb"`
|
||||||
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
|
RunNCCL *bool `json:"run_nccl"`
|
||||||
|
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||||
|
DisplayName string `json:"display_name"`
|
||||||
|
}
|
||||||
|
if r.Body != nil {
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
runNCCL := true
|
||||||
|
if body.RunNCCL != nil {
|
||||||
|
runNCCL = *body.RunNCCL
|
||||||
|
}
|
||||||
|
parallelGPUs := false
|
||||||
|
if body.ParallelGPUs != nil {
|
||||||
|
parallelGPUs = *body.ParallelGPUs
|
||||||
|
}
|
||||||
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
|
name = body.DisplayName
|
||||||
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||||
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: parallelGPUs,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
id := r.URL.Query().Get("job_id")
|
id := r.URL.Query().Get("job_id")
|
||||||
if id == "" {
|
if id == "" {
|
||||||
@@ -330,11 +614,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||||
|
status := "ok"
|
||||||
if err != nil {
|
if err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, err.Error())
|
status = "error"
|
||||||
return
|
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
// Always return 200 with output so the frontend can display the actual
|
||||||
|
// systemctl error message instead of a generic "exit status 1".
|
||||||
|
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Network ───────────────────────────────────────────────────────────────────
|
// ── Network ───────────────────────────────────────────────────────────────────
|
||||||
@@ -486,6 +772,22 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
|||||||
|
|
||||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
gpus, err := h.opts.App.ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if gpus == nil {
|
||||||
|
gpus = []platform.NvidiaGPU{}
|
||||||
|
}
|
||||||
|
writeJSON(w, gpus)
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.opts.App == nil {
|
if h.opts.App == nil {
|
||||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
@@ -511,14 +813,33 @@ func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
|
|||||||
_, amdErr := os.Stat("/dev/kfd")
|
_, amdErr := os.Stat("/dev/kfd")
|
||||||
nvidiaUp := nvidiaErr == nil
|
nvidiaUp := nvidiaErr == nil
|
||||||
amdUp := amdErr == nil
|
amdUp := amdErr == nil
|
||||||
|
_, dcgmErr := exec.LookPath("dcgmi")
|
||||||
|
_, ncclStressErr := exec.LookPath("bee-nccl-gpu-stress")
|
||||||
|
_, johnErr := exec.LookPath("bee-john-gpu-stress")
|
||||||
|
_, beeBurnErr := exec.LookPath("bee-gpu-burn")
|
||||||
|
_, nvBandwidthErr := exec.LookPath("nvbandwidth")
|
||||||
|
profErr := lookPathAny("dcgmproftester", "dcgmproftester13", "dcgmproftester12", "dcgmproftester11")
|
||||||
writeJSON(w, []toolEntry{
|
writeJSON(w, []toolEntry{
|
||||||
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
|
{ID: "nvidia-compute", Available: nvidiaUp && profErr == nil, Vendor: "nvidia"},
|
||||||
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
|
{ID: "nvidia-targeted-power", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
|
||||||
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
|
{ID: "nvidia-pulse", Available: nvidiaUp && dcgmErr == nil, Vendor: "nvidia"},
|
||||||
|
{ID: "nvidia-interconnect", Available: nvidiaUp && ncclStressErr == nil, Vendor: "nvidia"},
|
||||||
|
{ID: "nvidia-bandwidth", Available: nvidiaUp && dcgmErr == nil && nvBandwidthErr == nil, Vendor: "nvidia"},
|
||||||
|
{ID: "bee-gpu-burn", Available: nvidiaUp && beeBurnErr == nil, Vendor: "nvidia"},
|
||||||
|
{ID: "john", Available: nvidiaUp && johnErr == nil, Vendor: "nvidia"},
|
||||||
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// lookPathAny reports whether at least one of names resolves through
// exec.LookPath. It returns nil on the first name that is found and
// exec.ErrNotFound when none is (including when names is empty).
func lookPathAny(names ...string) error {
	for i := 0; i < len(names); i++ {
		_, err := exec.LookPath(names[i])
		if err != nil {
			continue
		}
		return nil
	}
	return exec.ErrNotFound
}
|
||||||
|
|
||||||
// ── System ────────────────────────────────────────────────────────────────────
|
// ── System ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -557,7 +878,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
|
|
||||||
var standardTools = []string{
|
var standardTools = []string{
|
||||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||||
"nvidia-smi", "memtester", "stress-ng", "nvtop",
|
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||||
"mstflint", "qrencode",
|
"mstflint", "qrencode",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -64,6 +65,141 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 1 {
|
||||||
|
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
task := globalQueue.tasks[0]
|
||||||
|
if task.Target != "nvidia-benchmark" {
|
||||||
|
t.Fatalf("target=%q want nvidia-benchmark", task.Target)
|
||||||
|
}
|
||||||
|
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
|
||||||
|
t.Fatalf("gpu indices=%v want [1 3]", got)
|
||||||
|
}
|
||||||
|
if task.params.RunNCCL {
|
||||||
|
t.Fatal("RunNCCL should reflect explicit false from request")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
|
|||||||
773
audit/internal/webui/charts_svg.go
Normal file
773
audit/internal/webui/charts_svg.go
Normal file
@@ -0,0 +1,773 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// chartTimelineSegment is one contiguous span of a chart's time range.
type chartTimelineSegment struct {
	// Start and End bound the span.
	Start, End time.Time
	// Active is set by chartTimelineSegmentsForRange for spans covered by a
	// task's run interval and cleared for the gaps between them.
	Active bool
}
|
||||||
|
|
||||||
|
// chartScale describes a Y axis: the value range it spans and the values at
// which tick marks are drawn.
type chartScale struct {
	Min, Max float64
	Ticks    []float64
}
|
||||||
|
|
||||||
|
// chartLayout holds the pixel geometry of a chart: the overall canvas size and
// the edges of the rectangular plot area inside it.
type chartLayout struct {
	// Canvas size.
	Width, Height int
	// Horizontal edges of the plot area.
	PlotLeft, PlotRight int
	// Vertical edges of the plot area.
	PlotTop, PlotBottom int
}
|
||||||
|
|
||||||
|
// metricChartSeries is one plotted line: its display name, the title of its Y
// axis, its stroke color and one value per chart point.
type metricChartSeries struct {
	Name, AxisTitle, Color string
	Values                 []float64
}
|
||||||
|
|
||||||
|
// metricChartPalette lists the series colors used by renderMetricChartSVG,
// which assigns them by series position and wraps around when there are more
// series than colors.
var metricChartPalette = []string{
	"#5794f2", "#73bf69",
	"#f2cc0c", "#ff9830",
	"#f2495c", "#b877d9",
	"#56d2f7", "#8ab8ff",
	"#9adf8f", "#ffbe5c",
}
|
||||||
|
|
||||||
|
// gpuLabelCache holds display labels keyed by GPU index together with the time
// they were loaded.
// NOTE(review): mu presumably guards loadedAt and byIndex; the code that reads
// and refreshes this cache is outside this chunk — confirm there.
var gpuLabelCache struct {
	mu sync.Mutex

	loadedAt time.Time
	byIndex  map[int]string
}
|
||||||
|
|
||||||
|
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{time.Time{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range datasets {
|
||||||
|
if len(datasets[i]) == 0 {
|
||||||
|
datasets[i] = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
statsLabel := chartStatsLabel(datasets)
|
||||||
|
|
||||||
|
legendItems := []metricChartSeries{}
|
||||||
|
for i, name := range names {
|
||||||
|
color := metricChartPalette[i%len(metricChartPalette)]
|
||||||
|
values := make([]float64, pointCount)
|
||||||
|
if i < len(datasets) {
|
||||||
|
copy(values, coalesceDataset(datasets[i], pointCount))
|
||||||
|
}
|
||||||
|
legendItems = append(legendItems, metricChartSeries{
|
||||||
|
Name: name,
|
||||||
|
Color: color,
|
||||||
|
Values: values,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
scale := singleAxisChartScale(datasets, yMin, yMax)
|
||||||
|
layout := singleAxisChartLayout(canvasHeight, len(legendItems))
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||||
|
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scale)
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
writeSingleAxisY(&b, layout, scale)
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
for _, item := range legendItems {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, item.Values, scale, item.Color)
|
||||||
|
}
|
||||||
|
writeLegend(&b, layout, legendItems)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) ([]byte, bool, error) {
|
||||||
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
if temp == nil && power == nil && coreClock == nil {
|
||||||
|
return nil, false, nil
|
||||||
|
}
|
||||||
|
labels := sampleTimeLabels(samples)
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
svg, err := drawGPUOverviewChartSVG(
|
||||||
|
gpuDisplayLabel(idx)+" Overview",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[]metricChartSeries{
|
||||||
|
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||||
|
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||||
|
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||||
|
},
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false, err
|
||||||
|
}
|
||||||
|
return svg, true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||||
|
if len(series) != 3 {
|
||||||
|
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||||
|
}
|
||||||
|
const (
|
||||||
|
width = 1400
|
||||||
|
height = 840
|
||||||
|
plotLeft = 180
|
||||||
|
plotRight = 1220
|
||||||
|
plotTop = 96
|
||||||
|
plotBottom = 660
|
||||||
|
)
|
||||||
|
const (
|
||||||
|
leftOuterAxis = 72
|
||||||
|
leftInnerAxis = 132
|
||||||
|
rightInnerAxis = 1268
|
||||||
|
)
|
||||||
|
layout := chartLayout{
|
||||||
|
Width: width,
|
||||||
|
Height: height,
|
||||||
|
PlotLeft: plotLeft,
|
||||||
|
PlotRight: plotRight,
|
||||||
|
PlotTop: plotTop,
|
||||||
|
PlotBottom: plotBottom,
|
||||||
|
}
|
||||||
|
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
if pointCount == 0 {
|
||||||
|
pointCount = 1
|
||||||
|
labels = []string{""}
|
||||||
|
times = []time.Time{time.Time{}}
|
||||||
|
}
|
||||||
|
if len(labels) < pointCount {
|
||||||
|
padded := make([]string, pointCount)
|
||||||
|
copy(padded, labels)
|
||||||
|
labels = padded
|
||||||
|
}
|
||||||
|
if len(times) < pointCount {
|
||||||
|
times = synthesizeChartTimes(times, pointCount)
|
||||||
|
}
|
||||||
|
for i := range series {
|
||||||
|
if len(series[i].Values) == 0 {
|
||||||
|
series[i].Values = make([]float64, pointCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
scales := make([]chartScale, len(series))
|
||||||
|
for i := range series {
|
||||||
|
min, max := chartSeriesBounds(series[i].Values)
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
scales[i] = chartScale{
|
||||||
|
Min: ticks[0],
|
||||||
|
Max: ticks[len(ticks)-1],
|
||||||
|
Ticks: ticks,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
writeSVGOpen(&b, width, height)
|
||||||
|
writeChartFrame(&b, title, "", width, height)
|
||||||
|
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||||
|
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||||
|
writeHorizontalGrid(&b, layout, scales[0])
|
||||||
|
writeTimelineBoundaries(&b, layout, start, end, timeline)
|
||||||
|
writePlotBorder(&b, layout)
|
||||||
|
|
||||||
|
for i, axisLineX := range axisX {
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, layout.PlotTop, axisLineX, layout.PlotBottom, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="700" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX, 64, series[i].Color, sanitizeChartText(series[i].AxisTitle))
|
||||||
|
for _, tick := range scales[i].Ticks {
|
||||||
|
y := chartYForValue(valueClamp(tick, scales[i]), scales[i], layout.PlotTop, layout.PlotBottom)
|
||||||
|
label := sanitizeChartText(chartYAxisNumber(tick))
|
||||||
|
if i < 2 {
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, y, axisLineX+6, y, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX-8, y, series[i].Color, label)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="%s" stroke-width="1"/>`+"\n",
|
||||||
|
axisLineX, y, axisLineX-6, y, series[i].Color)
|
||||||
|
fmt.Fprintf(&b, `<text x="%d" y="%.1f" text-anchor="start" dy="4" font-family="sans-serif" font-size="10" fill="%s">%s</text>`+"\n",
|
||||||
|
axisLineX+8, y, series[i].Color, label)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writeXAxisLabels(&b, layout, times, labels, start, end, 8)
|
||||||
|
for i := range series {
|
||||||
|
writeSeriesPolyline(&b, layout, times, start, end, series[i].Values, scales[i], series[i].Color)
|
||||||
|
}
|
||||||
|
writeLegend(&b, layout, series)
|
||||||
|
writeSVGClose(&b)
|
||||||
|
return []byte(b.String()), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func metricsTimelineSegments(samples []platform.LiveMetricSample, now time.Time) []chartTimelineSegment {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
times := sampleTimes(samples)
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return chartTimelineSegmentsForRange(start, end, now, snapshotTaskHistory())
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotTaskHistory() []Task {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
out := make([]Task, len(globalQueue.tasks))
|
||||||
|
for i, t := range globalQueue.tasks {
|
||||||
|
out[i] = *t
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartTimelineSegmentsForRange(start, end, now time.Time, tasks []Task) []chartTimelineSegment {
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
type interval struct {
|
||||||
|
start time.Time
|
||||||
|
end time.Time
|
||||||
|
}
|
||||||
|
active := make([]interval, 0, len(tasks))
|
||||||
|
for _, task := range tasks {
|
||||||
|
if task.StartedAt == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
intervalStart := task.StartedAt.UTC()
|
||||||
|
intervalEnd := now.UTC()
|
||||||
|
if task.DoneAt != nil {
|
||||||
|
intervalEnd = task.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if !intervalEnd.After(intervalStart) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if intervalEnd.Before(start) || intervalStart.After(end) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if intervalStart.Before(start) {
|
||||||
|
intervalStart = start
|
||||||
|
}
|
||||||
|
if intervalEnd.After(end) {
|
||||||
|
intervalEnd = end
|
||||||
|
}
|
||||||
|
active = append(active, interval{start: intervalStart, end: intervalEnd})
|
||||||
|
}
|
||||||
|
sort.Slice(active, func(i, j int) bool {
|
||||||
|
if active[i].start.Equal(active[j].start) {
|
||||||
|
return active[i].end.Before(active[j].end)
|
||||||
|
}
|
||||||
|
return active[i].start.Before(active[j].start)
|
||||||
|
})
|
||||||
|
merged := make([]interval, 0, len(active))
|
||||||
|
for _, span := range active {
|
||||||
|
if len(merged) == 0 {
|
||||||
|
merged = append(merged, span)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
last := &merged[len(merged)-1]
|
||||||
|
if !span.start.After(last.end) {
|
||||||
|
if span.end.After(last.end) {
|
||||||
|
last.end = span.end
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
merged = append(merged, span)
|
||||||
|
}
|
||||||
|
|
||||||
|
segments := make([]chartTimelineSegment, 0, len(merged)*2+1)
|
||||||
|
cursor := start
|
||||||
|
for _, span := range merged {
|
||||||
|
if span.start.After(cursor) {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: cursor, End: span.start, Active: false})
|
||||||
|
}
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: span.start, End: span.end, Active: true})
|
||||||
|
cursor = span.end
|
||||||
|
}
|
||||||
|
if cursor.Before(end) {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: cursor, End: end, Active: false})
|
||||||
|
}
|
||||||
|
if len(segments) == 0 {
|
||||||
|
segments = append(segments, chartTimelineSegment{Start: start, End: end, Active: false})
|
||||||
|
}
|
||||||
|
return segments
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleTimes(samples []platform.LiveMetricSample) []time.Time {
|
||||||
|
times := make([]time.Time, 0, len(samples))
|
||||||
|
for _, sample := range samples {
|
||||||
|
times = append(times, sample.Timestamp)
|
||||||
|
}
|
||||||
|
return times
|
||||||
|
}
|
||||||
|
|
||||||
|
func singleAxisChartScale(datasets [][]float64, yMin, yMax *float64) chartScale {
|
||||||
|
min, max := 0.0, 1.0
|
||||||
|
if yMin != nil && yMax != nil {
|
||||||
|
min, max = *yMin, *yMax
|
||||||
|
} else {
|
||||||
|
min, max = chartSeriesBounds(flattenDatasets(datasets))
|
||||||
|
if yMin != nil {
|
||||||
|
min = *yMin
|
||||||
|
}
|
||||||
|
if yMax != nil {
|
||||||
|
max = *yMax
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ticks := chartNiceTicks(min, max, 8)
|
||||||
|
return chartScale{Min: ticks[0], Max: ticks[len(ticks)-1], Ticks: ticks}
|
||||||
|
}
|
||||||
|
|
||||||
|
// flattenDatasets concatenates all datasets, in order, into one newly
// allocated slice. The result is non-nil even when there is nothing to copy.
func flattenDatasets(datasets [][]float64) []float64 {
	size := 0
	for i := range datasets {
		size += len(datasets[i])
	}

	flat := make([]float64, size)
	offset := 0
	for i := range datasets {
		offset += copy(flat[offset:], datasets[i])
	}
	return flat
}
|
||||||
|
|
||||||
|
func singleAxisChartLayout(canvasHeight int, seriesCount int) chartLayout {
|
||||||
|
legendRows := 0
|
||||||
|
if chartLegendVisible(seriesCount) && seriesCount > 0 {
|
||||||
|
cols := 4
|
||||||
|
if seriesCount < cols {
|
||||||
|
cols = seriesCount
|
||||||
|
}
|
||||||
|
legendRows = (seriesCount + cols - 1) / cols
|
||||||
|
}
|
||||||
|
legendHeight := 0
|
||||||
|
if legendRows > 0 {
|
||||||
|
legendHeight = legendRows*24 + 24
|
||||||
|
}
|
||||||
|
return chartLayout{
|
||||||
|
Width: 1400,
|
||||||
|
Height: canvasHeight,
|
||||||
|
PlotLeft: 96,
|
||||||
|
PlotRight: 1352,
|
||||||
|
PlotTop: 72,
|
||||||
|
PlotBottom: canvasHeight - 60 - legendHeight,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartTimeBounds returns the earliest and latest of times, both converted to
// UTC. Two zero times are returned when times is empty.
func chartTimeBounds(times []time.Time) (time.Time, time.Time) {
	var earliest, latest time.Time
	for i := range times {
		ts := times[i].UTC()
		if i == 0 {
			earliest, latest = ts, ts
			continue
		}
		if ts.Before(earliest) {
			earliest = ts
		}
		if ts.After(latest) {
			latest = ts
		}
	}
	return earliest, latest
}
|
||||||
|
|
||||||
|
// synthesizeChartTimes returns exactly count timestamps for a chart whose
// caller supplied fewer (or more) timestamps than it has points.
//
//   - count <= 0 yields nil.
//   - When len(times) == count, times is returned unchanged.
//   - When more than count timestamps are supplied, the first count are kept.
//   - When some timestamps are supplied, they are kept and the remainder is
//     filled in at one-minute steps after the last supplied one. Earlier
//     versions threw away two or more supplied timestamps and replaced them
//     with wall-clock-based ones, detaching the chart from its real times.
//   - When none are supplied, the result is one-minute steps ending at the
//     current UTC time.
//
// The input slice is never modified.
func synthesizeChartTimes(times []time.Time, count int) []time.Time {
	if count <= 0 {
		return nil
	}
	if len(times) == count {
		return times
	}
	if len(times) > count {
		// Cap the capacity so appends by the caller cannot reach the tail.
		return times[:count:count]
	}

	out := make([]time.Time, count)
	if len(times) == 0 {
		base := time.Now().UTC().Add(-time.Duration(count-1) * time.Minute)
		for i := range out {
			out[i] = base.Add(time.Duration(i) * time.Minute)
		}
		return out
	}

	known := copy(out, times)
	last := times[known-1]
	for i := known; i < count; i++ {
		out[i] = last.Add(time.Duration(i-known+1) * time.Minute)
	}
	return out
}
|
||||||
|
|
||||||
|
// writeSVGOpen writes the opening <svg> tag, using width and height both for
// the element size and for the viewBox.
func writeSVGOpen(b *strings.Builder, width, height int) {
	w, h := width, height
	fmt.Fprintf(b, `<svg xmlns="http://www.w3.org/2000/svg" width="%d" height="%d" viewBox="0 0 %d %d">`+"\n", w, h, w, h)
}
|
||||||
|
|
||||||
|
// writeSVGClose writes the closing </svg> tag followed by a newline.
func writeSVGClose(b *strings.Builder) {
	const closingTag = "</svg>\n"
	b.WriteString(closingTag)
}
|
||||||
|
|
||||||
|
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||||
|
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(title))
|
||||||
|
if strings.TrimSpace(subtitle) != "" {
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||||
|
width/2, sanitizeChartText(subtitle))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||||
|
fmt.Fprintf(b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#cbd5e1" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotRight-layout.PlotLeft, layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeHorizontalGrid(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
b.WriteString(`<g stroke="#e2e8f0" stroke-width="1">` + "\n")
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotRight, y)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeVerticalGrid(b *strings.Builder, layout chartLayout, times []time.Time, pointCount, target int) {
|
||||||
|
if pointCount <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
start, end := chartTimeBounds(times)
|
||||||
|
b.WriteString(`<g stroke="#edf2f7" stroke-width="1">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
ts := chartPointTime(times, idx)
|
||||||
|
x := chartXForTime(ts, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%d" x2="%.1f" y2="%d"/>`+"\n",
|
||||||
|
x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSingleAxisY(b *strings.Builder, layout chartLayout, scale chartScale) {
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, layout.PlotTop, layout.PlotLeft, layout.PlotBottom)
|
||||||
|
for _, tick := range scale.Ticks {
|
||||||
|
y := chartYForValue(tick, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%.1f" x2="%d" y2="%.1f" stroke="#64748b" stroke-width="1"/>`+"\n",
|
||||||
|
layout.PlotLeft, y, layout.PlotLeft-6, y)
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%.1f" text-anchor="end" dy="4" font-family="sans-serif" font-size="10" fill="#475569">%s</text>`+"\n",
|
||||||
|
layout.PlotLeft-10, y, sanitizeChartText(chartYAxisNumber(tick)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeXAxisLabels(b *strings.Builder, layout chartLayout, times []time.Time, labels []string, start, end time.Time, target int) {
|
||||||
|
pointCount := len(labels)
|
||||||
|
if len(times) > pointCount {
|
||||||
|
pointCount = len(times)
|
||||||
|
}
|
||||||
|
b.WriteString(`<g font-family="sans-serif" font-size="11" fill="#64748b" text-anchor="middle">` + "\n")
|
||||||
|
for _, idx := range gpuChartLabelIndices(pointCount, target) {
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
label := ""
|
||||||
|
if idx < len(labels) {
|
||||||
|
label = labels[idx]
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%d">%s</text>`+"\n", x, layout.PlotBottom+28, sanitizeChartText(label))
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
fmt.Fprintf(b, `<text x="%d" y="%d" text-anchor="middle" font-family="sans-serif" font-size="12" fill="#64748b">Time</text>`+"\n",
|
||||||
|
(layout.PlotLeft+layout.PlotRight)/2, layout.PlotBottom+48)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Time, start, end time.Time, values []float64, scale chartScale, color string) {
|
||||||
|
if len(values) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var points strings.Builder
|
||||||
|
for idx, value := range values {
|
||||||
|
if idx > 0 {
|
||||||
|
points.WriteByte(' ')
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, idx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(value, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
points.WriteString(strconv.FormatFloat(x, 'f', 1, 64))
|
||||||
|
points.WriteByte(',')
|
||||||
|
points.WriteString(strconv.FormatFloat(y, 'f', 1, 64))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(b, `<polyline points="%s" fill="none" stroke="%s" stroke-width="2.2" stroke-linejoin="round" stroke-linecap="round"/>`+"\n",
|
||||||
|
points.String(), color)
|
||||||
|
if len(values) == 1 {
|
||||||
|
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
peakIdx := 0
|
||||||
|
peakValue := values[0]
|
||||||
|
for idx, value := range values[1:] {
|
||||||
|
if value >= peakValue {
|
||||||
|
peakIdx = idx + 1
|
||||||
|
peakValue = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||||
|
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||||
|
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||||
|
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||||
|
if !chartLegendVisible(len(series)) || len(series) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
cols := 4
|
||||||
|
if len(series) < cols {
|
||||||
|
cols = len(series)
|
||||||
|
}
|
||||||
|
cellWidth := float64(layout.PlotRight-layout.PlotLeft) / float64(cols)
|
||||||
|
baseY := layout.PlotBottom + 74
|
||||||
|
for i, item := range series {
|
||||||
|
row := i / cols
|
||||||
|
col := i % cols
|
||||||
|
x := float64(layout.PlotLeft) + cellWidth*float64(col) + 8
|
||||||
|
y := float64(baseY + row*24)
|
||||||
|
fmt.Fprintf(b, `<line x1="%.1f" y1="%.1f" x2="%.1f" y2="%.1f" stroke="%s" stroke-width="3"/>`+"\n",
|
||||||
|
x, y, x+28, y, item.Color)
|
||||||
|
fmt.Fprintf(b, `<text x="%.1f" y="%.1f" font-family="sans-serif" font-size="12" fill="#1f2937">%s</text>`+"\n",
|
||||||
|
x+38, y+4, sanitizeChartText(item.Name))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineIdleSpans(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
b.WriteString(`<g data-role="timeline-overlay">` + "\n")
|
||||||
|
for _, segment := range segments {
|
||||||
|
if segment.Active || !segment.End.After(segment.Start) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x0 := chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
x1 := chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)
|
||||||
|
fmt.Fprintf(b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="#475569" opacity="0.10"/>`+"\n",
|
||||||
|
x0, layout.PlotTop, math.Max(1, x1-x0), layout.PlotBottom-layout.PlotTop)
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end time.Time, segments []chartTimelineSegment) {
|
||||||
|
if len(segments) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
seen := map[int]bool{}
|
||||||
|
b.WriteString(`<g data-role="timeline-boundaries" stroke="#94a3b8" stroke-width="1.2">` + "\n")
|
||||||
|
for i, segment := range segments {
|
||||||
|
if i > 0 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.Start, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i < len(segments)-1 {
|
||||||
|
x := int(math.Round(chartXForTime(segment.End, start, end, layout.PlotLeft, layout.PlotRight)))
|
||||||
|
if !seen[x] {
|
||||||
|
seen[x] = true
|
||||||
|
fmt.Fprintf(b, `<line x1="%d" y1="%d" x2="%d" y2="%d"/>`+"\n", x, layout.PlotTop, x, layout.PlotBottom)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString(`</g>` + "\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartXForTime maps ts linearly from the time range [start, end] onto the
// pixel range [left, right]. ts is clamped into the range first. When end is
// not after start the range is degenerate and the horizontal midpoint is
// returned.
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
	if !end.After(start) {
		return float64(left+right) / 2
	}

	switch {
	case ts.Before(start):
		ts = start
	case ts.After(end):
		ts = end
	}

	elapsed := float64(ts.Sub(start))
	total := float64(end.Sub(start))
	return float64(left) + elapsed/total*float64(right-left)
}
|
||||||
|
|
||||||
|
// chartPointTime returns the UTC timestamp for the idx-th chart point.
//
// When times[idx] exists and is non-zero it is used directly. Otherwise the
// timestamp is synthesized as idx minutes after times[0] (if that is non-zero)
// or, failing that, idx minutes after the current time.
func chartPointTime(times []time.Time, idx int) time.Time {
	hasPoint := idx >= 0 && idx < len(times)
	if hasPoint && !times[idx].IsZero() {
		return times[idx].UTC()
	}

	offset := time.Duration(idx) * time.Minute
	switch {
	case len(times) > 0 && !times[0].IsZero():
		return times[0].UTC().Add(offset)
	default:
		return time.Now().UTC().Add(offset)
	}
}
|
||||||
|
|
||||||
|
func chartYForValue(value float64, scale chartScale, plotTop, plotBottom int) float64 {
|
||||||
|
if scale.Max <= scale.Min {
|
||||||
|
return float64(plotTop+plotBottom) / 2
|
||||||
|
}
|
||||||
|
return float64(plotBottom) - (value-scale.Min)/(scale.Max-scale.Min)*float64(plotBottom-plotTop)
|
||||||
|
}
|
||||||
|
|
||||||
|
// chartSeriesBounds returns a (low, high) axis range for values.
//
//   - No values, or all values equal to zero: (0, 1).
//   - All values equal and non-zero: the range is widened by 10% of the
//     magnitude on each side.
//   - If the resulting low bound is still positive, the range is widened by a
//     further 20% of its span on each side, with the low bound floored at 0.
func chartSeriesBounds(values []float64) (float64, float64) {
	if len(values) == 0 {
		return 0, 1
	}

	lo, hi := values[0], values[0]
	for _, v := range values[1:] {
		switch {
		case v < lo:
			lo = v
		case v > hi:
			hi = v
		}
	}

	if lo == hi {
		if hi == 0 {
			return 0, 1
		}
		margin := math.Abs(hi) * 0.1
		if margin == 0 {
			margin = 1
		}
		lo, hi = lo-margin, hi+margin
	}

	if lo > 0 {
		margin := (hi - lo) * 0.2
		if margin == 0 {
			margin = hi * 0.1
		}
		hi += margin
		lo -= margin
		if lo < 0 {
			lo = 0
		}
	}
	return lo, hi
}
|
||||||
|
|
||||||
|
// chartNiceTicks returns evenly spaced tick values covering [min, max] with a
// step of 1, 2, 5 or 10 times a power of ten, chosen so that the tick count is
// at most roughly 1.5x target. The first tick is at or below min and the last
// at or above max. Tick values are rounded to nine decimal places. When
// min == max the range is treated as [min, min+1].
func chartNiceTicks(min, max float64, target int) []float64 {
	if min == max {
		max = min + 1
	}
	span := max - min
	wanted := float64(target)

	// Start from the power of ten just below the ideal step, then take the
	// first multiplier that brings the tick count under the limit.
	magnitude := math.Pow(10, math.Floor(math.Log10(span/wanted)))
	step := magnitude
	for _, multiplier := range [...]float64{1, 2, 5, 10} {
		if candidate := multiplier * magnitude; span/candidate <= wanted*1.5 {
			step = candidate
			break
		}
	}

	first := math.Floor(min/step) * step
	last := math.Ceil(max/step) * step
	// The small tolerance keeps the final tick despite accumulated float error.
	limit := last + step*0.001

	var ticks []float64
	for v := first; v <= limit; v += step {
		ticks = append(ticks, math.Round(v*1e9)/1e9)
	}
	return ticks
}
|
||||||
|
|
||||||
|
func valueClamp(value float64, scale chartScale) float64 {
|
||||||
|
if value < scale.Min {
|
||||||
|
return scale.Min
|
||||||
|
}
|
||||||
|
if value > scale.Max {
|
||||||
|
return scale.Max
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartStatsLabel(datasets [][]float64) string {
|
||||||
|
mn, avg, mx := globalStats(datasets)
|
||||||
|
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("min %s avg %s max %s",
|
||||||
|
chartLegendNumber(mn),
|
||||||
|
chartLegendNumber(avg),
|
||||||
|
chartLegendNumber(mx),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDisplayLabel(idx int) string {
|
||||||
|
if name := gpuModelNameByIndex(idx); name != "" {
|
||||||
|
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("GPU %d", idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuModelNameByIndex(idx int) string {
|
||||||
|
now := time.Now()
|
||||||
|
gpuLabelCache.mu.Lock()
|
||||||
|
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||||
|
gpuLabelCache.loadedAt = now
|
||||||
|
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||||
|
gpuLabelCache.mu.Unlock()
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadGPUModelNames() map[int]string {
|
||||||
|
out := map[int]string{}
|
||||||
|
gpus, err := platform.New().ListNvidiaGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
name := strings.TrimSpace(gpu.Name)
|
||||||
|
if name != "" {
|
||||||
|
out[gpu.Index] = name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
@@ -9,13 +9,14 @@ import (
|
|||||||
|
|
||||||
// jobState holds the output lines and completion status of an async job.
|
// jobState holds the output lines and completion status of an async job.
|
||||||
type jobState struct {
|
type jobState struct {
|
||||||
lines []string
|
lines []string
|
||||||
done bool
|
done bool
|
||||||
err string
|
err string
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
subs []chan string
|
subs []chan string
|
||||||
cancel func() // optional cancel function; nil if job is not cancellable
|
cancel func() // optional cancel function; nil if job is not cancellable
|
||||||
logPath string
|
logPath string
|
||||||
|
serialPrefix string
|
||||||
}
|
}
|
||||||
|
|
||||||
// abort cancels the job if it has a cancel function and is not yet done.
|
// abort cancels the job if it has a cancel function and is not yet done.
|
||||||
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
|
|||||||
if j.logPath != "" {
|
if j.logPath != "" {
|
||||||
appendJobLog(j.logPath, line)
|
appendJobLog(j.logPath, line)
|
||||||
}
|
}
|
||||||
|
if j.serialPrefix != "" {
|
||||||
|
taskSerialWriteLine(j.serialPrefix + line)
|
||||||
|
}
|
||||||
for _, ch := range j.subs {
|
for _, ch := range j.subs {
|
||||||
select {
|
select {
|
||||||
case ch <- line:
|
case ch <- line:
|
||||||
@@ -84,12 +88,12 @@ func (m *jobManager) create(id string) *jobState {
|
|||||||
j := &jobState{}
|
j := &jobState{}
|
||||||
m.jobs[id] = j
|
m.jobs[id] = j
|
||||||
// Schedule cleanup after 30 minutes
|
// Schedule cleanup after 30 minutes
|
||||||
go func() {
|
goRecoverOnce("job cleanup", func() {
|
||||||
time.Sleep(30 * time.Minute)
|
time.Sleep(30 * time.Minute)
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
delete(m.jobs, id)
|
delete(m.jobs, id)
|
||||||
m.mu.Unlock()
|
m.mu.Unlock()
|
||||||
}()
|
})
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
|||||||
return j, ok
|
return j, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func newTaskJobState(logPath string) *jobState {
|
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||||
j := &jobState{logPath: logPath}
|
j := &jobState{logPath: logPath}
|
||||||
|
if len(serialPrefix) > 0 {
|
||||||
|
j.serialPrefix = serialPrefix[0]
|
||||||
|
}
|
||||||
if logPath == "" {
|
if logPath == "" {
|
||||||
return j
|
return j
|
||||||
}
|
}
|
||||||
|
|||||||
242
audit/internal/webui/kmsg_watcher.go
Normal file
242
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"io"
|
||||||
|
"log/slog"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||||
|
// It supports multiple concurrent SAT tasks: a shared event window is open
|
||||||
|
// while any SAT task is running, and flushed when all tasks complete.
|
||||||
|
type kmsgWatcher struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
activeCount int // number of in-flight SAT tasks
|
||||||
|
window *kmsgWindow
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
type kmsgWindow struct {
|
||||||
|
targets []string // SAT targets running concurrently
|
||||||
|
startedAt time.Time
|
||||||
|
seen map[kmsgEventKey]bool
|
||||||
|
events []kmsgEvent
|
||||||
|
}
|
||||||
|
|
||||||
|
// kmsgEventKey identifies an event for deduplication inside a window.
type kmsgEventKey struct {
	// id is a PCIe BDF address or a device name; empty for unidentified events.
	id string
	// category is the hardware error category of the matching pattern.
	category string
}
|
||||||
|
|
||||||
|
// kmsgEvent is one kernel log message that matched a hardware error pattern.
type kmsgEvent struct {
	// timestamp is when the message was parsed.
	timestamp time.Time
	// raw is the message text.
	raw string
	// ids holds the BDF addresses or device names extracted from the message.
	ids []string
	// category is the category of the pattern that matched.
	category string
}
|
||||||
|
|
||||||
|
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||||
|
return &kmsgWatcher{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
// start launches the background kmsg reading goroutine.
|
||||||
|
func (w *kmsgWatcher) start() {
|
||||||
|
goRecoverLoop("kmsg watcher", 5*time.Second, w.run)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) run() {
|
||||||
|
for {
|
||||||
|
f, err := os.Open("/dev/kmsg")
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||||
|
time.Sleep(30 * time.Second)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Best-effort seek to end so we only capture events from now forward.
|
||||||
|
_, _ = f.Seek(0, io.SeekEnd)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
evt, ok := parseKmsgLine(line)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
w.mu.Lock()
|
||||||
|
if w.window != nil {
|
||||||
|
w.recordEvent(evt)
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
slog.Warn("kmsg watcher stopped", "err", err)
|
||||||
|
}
|
||||||
|
_ = f.Close()
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||||
|
// Must be called with w.mu held.
|
||||||
|
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
key := kmsgEventKey{id: "", category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
key := kmsgEventKey{id: id, category: evt.category}
|
||||||
|
if !w.window.seen[key] {
|
||||||
|
w.window.seen[key] = true
|
||||||
|
w.window.events = append(w.window.events, evt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskStarted increments the active task counter and opens a shared event window
|
||||||
|
// if this is the first task starting.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
defer w.mu.Unlock()
|
||||||
|
if w.activeCount == 0 {
|
||||||
|
w.window = &kmsgWindow{
|
||||||
|
startedAt: time.Now(),
|
||||||
|
seen: make(map[kmsgEventKey]bool),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.activeCount++
|
||||||
|
if w.window != nil {
|
||||||
|
w.window.targets = append(w.window.targets, target)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NotifyTaskFinished decrements the active task counter. When all tasks finish,
|
||||||
|
// it flushes the accumulated events to the status DB.
|
||||||
|
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||||
|
w.mu.Lock()
|
||||||
|
w.activeCount--
|
||||||
|
var window *kmsgWindow
|
||||||
|
if w.activeCount <= 0 {
|
||||||
|
w.activeCount = 0
|
||||||
|
window = w.window
|
||||||
|
w.window = nil
|
||||||
|
}
|
||||||
|
w.mu.Unlock()
|
||||||
|
|
||||||
|
if window == nil || len(window.events) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
goRecoverOnce("kmsg watcher flush", func() { w.flushWindow(window) })
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
source := "watchdog:kmsg"
|
||||||
|
// Collect unique component keys from events.
|
||||||
|
seen := map[string]string{} // componentKey → first raw line
|
||||||
|
for _, evt := range window.events {
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
// MCE or un-identified error.
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu", "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
if _, exists := seen[key]; !exists {
|
||||||
|
seen[key] = evt.raw
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for key, detail := range seen {
|
||||||
|
detail = "kernel error during SAT (" + strings.Join(window.targets, ",") + "): " + truncate(detail, 120)
|
||||||
|
w.statusDB.Record(key, source, "Warning", detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||||
|
msg := raw
|
||||||
|
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||||
|
msg = strings.TrimSpace(raw[idx+1:])
|
||||||
|
}
|
||||||
|
if msg == "" {
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
m := p.Re.FindStringSubmatch(msg)
|
||||||
|
if m == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
evt := kmsgEvent{
|
||||||
|
timestamp: time.Now(),
|
||||||
|
raw: msg,
|
||||||
|
category: p.Category,
|
||||||
|
}
|
||||||
|
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||||
|
}
|
||||||
|
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||||
|
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||||
|
}
|
||||||
|
return evt, true
|
||||||
|
}
|
||||||
|
return kmsgEvent{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeBDF trims and lower-cases a PCIe BDF and, when it has exactly one
// ':' (the short "c8:00.0" form), prefixes the default domain to produce the
// four-part form "0000:c8:00.0". Any other input is returned trimmed and
// lower-cased.
func normalizeBDF(bdf string) string {
	cleaned := strings.ToLower(strings.TrimSpace(bdf))
	if strings.Count(cleaned, ":") != 1 {
		return cleaned
	}
	return "0000:" + cleaned
}
|
||||||
|
|
||||||
|
// truncate returns s unchanged when it is at most max bytes long. Otherwise it
// returns the longest prefix of s that is no longer than max bytes and does
// not end in the middle of a UTF-8 sequence, followed by "...". A negative max
// is treated as 0.
func truncate(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}
	cut := max
	// s[cut] is the first dropped byte. While it is a UTF-8 continuation byte
	// (10xxxxxx) the cut would split a rune, so move back to the rune's start.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
|
||||||
|
|
||||||
|
// isSATTarget reports whether target names a task that runs a hardware
// acceptance test (SAT). Only the exact names listed below qualify.
func isSATTarget(target string) bool {
	switch target {
	// NVIDIA GPU tests.
	case "nvidia",
		"nvidia-targeted-stress",
		"nvidia-benchmark",
		"nvidia-compute",
		"nvidia-targeted-power",
		"nvidia-pulse",
		"nvidia-interconnect",
		"nvidia-bandwidth",
		"nvidia-stress":
		return true
	// AMD GPU tests.
	case "amd", "amd-mem", "amd-bandwidth", "amd-stress":
		return true
	// CPU, memory, storage and whole-platform tests.
	case "cpu", "memory", "memory-stress", "storage", "sat-stress", "platform-stress":
		return true
	default:
		return false
	}
}
|
||||||
@@ -8,6 +8,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
@@ -21,6 +22,13 @@ type MetricsDB struct {
|
|||||||
db *sql.DB
|
db *sql.DB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *MetricsDB) Close() error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return m.db.Close()
|
||||||
|
}
|
||||||
|
|
||||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
@@ -54,6 +62,8 @@ CREATE TABLE IF NOT EXISTS gpu_metrics (
|
|||||||
usage_pct REAL,
|
usage_pct REAL,
|
||||||
mem_usage_pct REAL,
|
mem_usage_pct REAL,
|
||||||
power_w REAL,
|
power_w REAL,
|
||||||
|
clock_mhz REAL,
|
||||||
|
mem_clock_mhz REAL,
|
||||||
PRIMARY KEY (ts, gpu_index)
|
PRIMARY KEY (ts, gpu_index)
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS fan_metrics (
|
CREATE TABLE IF NOT EXISTS fan_metrics (
|
||||||
@@ -70,6 +80,38 @@ CREATE TABLE IF NOT EXISTS temp_metrics (
|
|||||||
PRIMARY KEY (ts, name)
|
PRIMARY KEY (ts, name)
|
||||||
);
|
);
|
||||||
`)
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := ensureMetricsColumn(db, "gpu_metrics", "clock_mhz", "REAL"); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ensureMetricsColumn(db, "gpu_metrics", "mem_clock_mhz", "REAL")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureMetricsColumn adds column (with the given SQL type definition) to
// table unless a column of that name already exists. Column names are
// compared case-insensitively against the output of PRAGMA table_info.
//
// table, column and definition are concatenated into the SQL text, so callers
// must pass trusted identifiers only, never user input.
func ensureMetricsColumn(db *sql.DB, table, column, definition string) error {
	rows, err := db.Query("PRAGMA table_info(" + table + ")")
	if err != nil {
		return err
	}
	defer rows.Close()

	for rows.Next() {
		// The six scan targets follow the column order used by this Scan call:
		// cid, name, type, notnull, dflt_value, pk.
		var cid int
		var name, ctype string
		var notNull, pk int
		var dflt sql.NullString
		if err := rows.Scan(&cid, &name, &ctype, &notNull, &dflt, &pk); err != nil {
			return err
		}
		if strings.EqualFold(name, column) {
			return nil
		}
	}
	if err := rows.Err(); err != nil {
		return err
	}

	_, err = db.Exec("ALTER TABLE " + table + " ADD COLUMN " + column + " " + definition)
	return err
}
|
||||||
|
|
||||||
@@ -91,8 +133,8 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
}
|
}
|
||||||
for _, g := range s.GPUs {
|
for _, g := range s.GPUs {
|
||||||
_, err = tx.Exec(
|
_, err = tx.Exec(
|
||||||
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w) VALUES(?,?,?,?,?,?)`,
|
`INSERT OR REPLACE INTO gpu_metrics(ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz) VALUES(?,?,?,?,?,?,?,?)`,
|
||||||
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW,
|
ts, g.GPUIndex, g.TempC, g.UsagePct, g.MemUsagePct, g.PowerW, g.ClockMHz, g.MemClockMHz,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -129,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
|||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadBetween returns samples in chronological order within the given time window.
|
||||||
|
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
if m == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if start.IsZero() || end.IsZero() {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
start, end = end, start
|
||||||
|
}
|
||||||
|
return m.loadSamples(
|
||||||
|
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||||
|
start.Unix(), end.Unix(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||||
rows, err := m.db.Query(query, args...)
|
rows, err := m.db.Query(query, args...)
|
||||||
@@ -163,7 +222,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
}
|
}
|
||||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||||
gRows, err := m.db.Query(
|
gRows, err := m.db.Query(
|
||||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w,IFNULL(clock_mhz,0),IFNULL(mem_clock_mhz,0) FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||||
minTS, maxTS,
|
minTS, maxTS,
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -171,7 +230,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
for gRows.Next() {
|
for gRows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var g platform.GPUMetricRow
|
var g platform.GPUMetricRow
|
||||||
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW); err == nil {
|
if err := gRows.Scan(&ts, &g.GPUIndex, &g.TempC, &g.UsagePct, &g.MemUsagePct, &g.PowerW, &g.ClockMHz, &g.MemClockMHz); err == nil {
|
||||||
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
gpuData[gpuKey{ts, g.GPUIndex}] = g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -283,7 +342,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
rows, err := m.db.Query(`
|
rows, err := m.db.Query(`
|
||||||
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
SELECT s.ts, s.cpu_load_pct, s.mem_load_pct, s.power_w,
|
||||||
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w
|
g.gpu_index, g.temp_c, g.usage_pct, g.mem_usage_pct, g.power_w,
|
||||||
|
g.clock_mhz, g.mem_clock_mhz
|
||||||
FROM sys_metrics s
|
FROM sys_metrics s
|
||||||
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
LEFT JOIN gpu_metrics g ON g.ts = s.ts
|
||||||
ORDER BY s.ts, g.gpu_index
|
ORDER BY s.ts, g.gpu_index
|
||||||
@@ -294,13 +354,13 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
defer rows.Close()
|
defer rows.Close()
|
||||||
|
|
||||||
cw := csv.NewWriter(w)
|
cw := csv.NewWriter(w)
|
||||||
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w"})
|
_ = cw.Write([]string{"ts", "cpu_load_pct", "mem_load_pct", "sys_power_w", "gpu_index", "gpu_temp_c", "gpu_usage_pct", "gpu_mem_pct", "gpu_power_w", "gpu_clock_mhz", "gpu_mem_clock_mhz"})
|
||||||
for rows.Next() {
|
for rows.Next() {
|
||||||
var ts int64
|
var ts int64
|
||||||
var cpu, mem, pwr float64
|
var cpu, mem, pwr float64
|
||||||
var gpuIdx sql.NullInt64
|
var gpuIdx sql.NullInt64
|
||||||
var gpuTemp, gpuUse, gpuMem, gpuPow sql.NullFloat64
|
var gpuTemp, gpuUse, gpuMem, gpuPow, gpuClock, gpuMemClock sql.NullFloat64
|
||||||
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow); err != nil {
|
if err := rows.Scan(&ts, &cpu, &mem, &pwr, &gpuIdx, &gpuTemp, &gpuUse, &gpuMem, &gpuPow, &gpuClock, &gpuMemClock); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
row := []string{
|
row := []string{
|
||||||
@@ -316,9 +376,11 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuUse.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuMem.Float64, 'f', 1, 64),
|
||||||
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
strconv.FormatFloat(gpuPow.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuClock.Float64, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(gpuMemClock.Float64, 'f', 1, 64),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
row = append(row, "", "", "", "", "")
|
row = append(row, "", "", "", "", "", "", "")
|
||||||
}
|
}
|
||||||
_ = cw.Write(row)
|
_ = cw.Write(row)
|
||||||
}
|
}
|
||||||
@@ -326,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
|||||||
return cw.Error()
|
return cw.Error()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close closes the database.
|
|
||||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
|
||||||
|
|
||||||
func nullFloat(v float64) sql.NullFloat64 {
|
func nullFloat(v float64) sql.NullFloat64 {
|
||||||
return sql.NullFloat64{Float64: v, Valid: true}
|
return sql.NullFloat64{Float64: v, Valid: true}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"database/sql"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
|
_ "modernc.org/sqlite"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
@@ -67,3 +69,106 @@ func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBMigratesLegacyGPUSchema(t *testing.T) {
|
||||||
|
path := filepath.Join(t.TempDir(), "metrics.db")
|
||||||
|
raw, err := sql.Open("sqlite", path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("sql.Open: %v", err)
|
||||||
|
}
|
||||||
|
_, err = raw.Exec(`
|
||||||
|
CREATE TABLE gpu_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
gpu_index INTEGER NOT NULL,
|
||||||
|
temp_c REAL,
|
||||||
|
usage_pct REAL,
|
||||||
|
mem_usage_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts, gpu_index)
|
||||||
|
);
|
||||||
|
CREATE TABLE sys_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
cpu_load_pct REAL,
|
||||||
|
mem_load_pct REAL,
|
||||||
|
power_w REAL,
|
||||||
|
PRIMARY KEY (ts)
|
||||||
|
);
|
||||||
|
CREATE TABLE fan_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
rpm REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
CREATE TABLE temp_metrics (
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
grp TEXT NOT NULL,
|
||||||
|
celsius REAL,
|
||||||
|
PRIMARY KEY (ts, name)
|
||||||
|
);
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("create legacy schema: %v", err)
|
||||||
|
}
|
||||||
|
_ = raw.Close()
|
||||||
|
|
||||||
|
db, err := openMetricsDB(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
now := time.Unix(1_700_000_100, 0).UTC()
|
||||||
|
err = db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: now,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2600},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
samples, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(samples) != 1 || len(samples[0].GPUs) != 1 {
|
||||||
|
t.Fatalf("samples=%+v", samples)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].ClockMHz; got != 1410 {
|
||||||
|
t.Fatalf("ClockMHz=%v want 1410", got)
|
||||||
|
}
|
||||||
|
if got := samples[0].GPUs[0].MemClockMHz; got != 2600 {
|
||||||
|
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||||
|
CPULoadPct: float64(i),
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadBetween: %v", err)
|
||||||
|
}
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||||
|
}
|
||||||
|
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||||
|
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskSerialWriteLine is the function used to emit a line on the serial
// console. It is a package variable rather than a direct call.
// NOTE(review): presumably so tests can swap in a stub — confirm.
var taskSerialWriteLine = writeTaskSerialLine

// writeTaskSerialLine writes one UTC-timestamped line to the first console
// device that accepts it, trying /dev/ttyS0, /dev/ttyS1 and /dev/console in
// that order.
//
// Surrounding whitespace is trimmed and blank lines are dropped. The write is
// best effort: if no device can be opened and written, the line is silently
// discarded.
func writeTaskSerialLine(line string) {
	line = strings.TrimSpace(line)
	if line == "" {
		return
	}
	payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
	for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
		// A device node can open successfully and still reject the write, so
		// only stop once a write has actually gone through.
		if writeSerialPayload(path, payload) == nil {
			return
		}
	}
}

// writeSerialPayload opens path write-only, writes payload and closes the
// file again. It returns the open or write error, if any.
func writeSerialPayload(path, payload string) error {
	f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
	if err != nil {
		return err
	}
	_, writeErr := f.WriteString(payload)
	// Nothing useful can be done if closing a console device fails.
	_ = f.Close()
	return writeErr
}
|
||||||
|
|
||||||
|
func taskSerialPrefix(t *Task) string {
|
||||||
|
if t == nil {
|
||||||
|
return "[task] "
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskSerialEvent(t *Task, event string) {
|
||||||
|
if t == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||||
|
}
|
||||||
@@ -1,15 +1,19 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"mime"
|
"mime"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -18,7 +22,6 @@ import (
|
|||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
gocharts "github.com/go-analyze/charts"
|
|
||||||
"reanimator/chart/viewer"
|
"reanimator/chart/viewer"
|
||||||
"reanimator/chart/web"
|
"reanimator/chart/web"
|
||||||
)
|
)
|
||||||
@@ -164,6 +167,8 @@ type handler struct {
|
|||||||
// pending network change (rollback on timeout)
|
// pending network change (rollback on timeout)
|
||||||
pendingNet *pendingNetChange
|
pendingNet *pendingNetChange
|
||||||
pendingNetMu sync.Mutex
|
pendingNetMu sync.Mutex
|
||||||
|
// kmsg hardware error watcher
|
||||||
|
kmsg *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewHandler creates the HTTP mux with all routes.
|
// NewHandler creates the HTTP mux with all routes.
|
||||||
@@ -203,12 +208,24 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
}
|
}
|
||||||
h.startMetricsCollector()
|
h.startMetricsCollector()
|
||||||
|
|
||||||
|
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||||
|
if opts.App != nil {
|
||||||
|
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||||
|
h.kmsg.start()
|
||||||
|
globalQueue.kmsgWatcher = h.kmsg
|
||||||
|
}
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
|
|
||||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||||
mux.HandleFunc("GET /api/ready", h.handleReady)
|
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||||
|
mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(loadingPageHTML))
|
||||||
|
})
|
||||||
|
|
||||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||||
@@ -225,6 +242,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// SAT
|
// SAT
|
||||||
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-stress/run", h.handleAPISATRun("nvidia-targeted-stress"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-compute/run", h.handleAPISATRun("nvidia-compute"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-targeted-power/run", h.handleAPISATRun("nvidia-targeted-power"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-pulse/run", h.handleAPISATRun("nvidia-pulse"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-interconnect/run", h.handleAPISATRun("nvidia-interconnect"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-bandwidth/run", h.handleAPISATRun("nvidia-bandwidth"))
|
||||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
@@ -238,6 +261,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
|
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun)
|
||||||
|
|
||||||
// Tasks
|
// Tasks
|
||||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||||
@@ -246,6 +270,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
|
||||||
|
mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
|
||||||
|
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
||||||
|
|
||||||
// Services
|
// Services
|
||||||
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
||||||
@@ -274,6 +301,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||||
|
|
||||||
// System
|
// System
|
||||||
@@ -300,11 +328,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /", h.handlePage)
|
mux.HandleFunc("GET /", h.handlePage)
|
||||||
|
|
||||||
h.mux = mux
|
h.mux = mux
|
||||||
return mux
|
return recoverMiddleware(mux)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) startMetricsCollector() {
|
func (h *handler) startMetricsCollector() {
|
||||||
go func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
@@ -315,7 +343,7 @@ func (h *handler) startMetricsCollector() {
|
|||||||
h.feedRings(sample)
|
h.feedRings(sample)
|
||||||
h.setLatestMetric(sample)
|
h.setLatestMetric(sample)
|
||||||
}
|
}
|
||||||
}()
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
@@ -336,7 +364,81 @@ func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
|||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
srv := &http.Server{
|
||||||
|
Addr: addr,
|
||||||
|
Handler: NewHandler(opts),
|
||||||
|
ReadHeaderTimeout: 5 * time.Second,
|
||||||
|
ReadTimeout: 30 * time.Second,
|
||||||
|
IdleTimeout: 2 * time.Minute,
|
||||||
|
}
|
||||||
|
return srv.ListenAndServe()
|
||||||
|
}
|
||||||
|
|
||||||
|
// trackingResponseWriter wraps an http.ResponseWriter and records whether any
// part of the response may already have been sent, so that a panic handler can
// tell whether it is still possible to send an error status.
type trackingResponseWriter struct {
	http.ResponseWriter
	// wroteHeader is true once the response has been started, or once the
	// connection has been taken over by a hijacker.
	wroteHeader bool
}

// WriteHeader marks the response as started and forwards the status code.
func (w *trackingResponseWriter) WriteHeader(statusCode int) {
	w.wroteHeader = true
	w.ResponseWriter.WriteHeader(statusCode)
}

// Write marks the response as started and forwards the bytes.
func (w *trackingResponseWriter) Write(p []byte) (int, error) {
	w.wroteHeader = true
	return w.ResponseWriter.Write(p)
}

// Flush forwards to the wrapped writer's http.Flusher when it has one. The
// response only counts as started if a flush actually takes place; when the
// wrapped writer cannot flush, this is a no-op.
func (w *trackingResponseWriter) Flush() {
	f, ok := w.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	w.wroteHeader = true
	f.Flush()
}

// Hijack hands the connection over to the caller via the wrapped writer's
// http.Hijacker. It returns an error when the wrapped writer does not support
// hijacking.
func (w *trackingResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
	h, ok := w.ResponseWriter.(http.Hijacker)
	if !ok {
		return nil, nil, fmt.Errorf("hijacking not supported")
	}
	conn, rw, err := h.Hijack()
	if err == nil {
		// After a successful hijack the ResponseWriter must not be used any
		// more, so treat the response as no longer writable.
		w.wroteHeader = true
	}
	return conn, rw, err
}

// Push forwards an HTTP/2 server push to the wrapped writer, or returns
// http.ErrNotSupported when the wrapped writer is not an http.Pusher.
func (w *trackingResponseWriter) Push(target string, opts *http.PushOptions) error {
	p, ok := w.ResponseWriter.(http.Pusher)
	if !ok {
		return http.ErrNotSupported
	}
	return p.Push(target, opts)
}

// ReadFrom copies r into the response, using the wrapped writer's
// io.ReaderFrom when available and a plain io.Copy otherwise. It returns the
// number of bytes copied and the first error encountered.
func (w *trackingResponseWriter) ReadFrom(r io.Reader) (int64, error) {
	// Both paths write to the wrapped writer directly, bypassing Write above,
	// so the response has to be recorded as started here.
	w.wroteHeader = true
	if rf, ok := w.ResponseWriter.(io.ReaderFrom); ok {
		return rf.ReadFrom(r)
	}
	return io.Copy(w.ResponseWriter, r)
}
|
||||||
|
|
||||||
|
func recoverMiddleware(next http.Handler) http.Handler {
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
tw := &trackingResponseWriter{ResponseWriter: w}
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
slog.Error("http handler panic",
|
||||||
|
"method", r.Method,
|
||||||
|
"path", r.URL.Path,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
if !tw.wroteHeader {
|
||||||
|
http.Error(tw, "internal server error", http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
next.ServeHTTP(tw, r)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
// ── Infrastructure handlers ──────────────────────────────────────────────────
|
||||||
@@ -466,13 +568,44 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
samples, err := h.metricsDB.LoadAll()
|
||||||
|
if err != nil || len(samples) == 0 {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
timeline := metricsTimelineSegments(samples, time.Now())
|
||||||
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
|
buf, ok, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||||
if !ok {
|
if !ok {
|
||||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
buf, err := renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
@@ -482,14 +615,6 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
|
||||||
samples, err := h.metricsDB.LoadAll()
|
|
||||||
if err != nil || len(samples) == 0 {
|
|
||||||
return nil, nil, nil, "", nil, nil, false
|
|
||||||
}
|
|
||||||
return chartDataFromSamples(path, samples)
|
|
||||||
}
|
|
||||||
|
|
||||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
@@ -569,18 +694,24 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(datasets...)
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-clock":
|
||||||
|
title = "GPU Core Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-memclock":
|
||||||
|
title = "GPU Memory Clock"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
idx, sub, ok := parseGPUChartPath(path)
|
||||||
sub := ""
|
if !ok {
|
||||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
return nil, nil, nil, "", nil, nil, false
|
||||||
sub = rest[i+1:]
|
|
||||||
rest = rest[:i]
|
|
||||||
}
|
}
|
||||||
idx := 0
|
|
||||||
fmt.Sscanf(rest, "%d", &idx)
|
|
||||||
switch sub {
|
switch sub {
|
||||||
case "load":
|
case "load":
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
title = gpuDisplayLabel(idx) + " Load"
|
||||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
if util == nil && mem == nil {
|
if util == nil && mem == nil {
|
||||||
@@ -591,7 +722,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = floatPtr(100)
|
yMax = floatPtr(100)
|
||||||
case "temp":
|
case "temp":
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
title = gpuDisplayLabel(idx) + " Temperature"
|
||||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
if temp == nil {
|
if temp == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -600,8 +731,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
names = []string{"Temp °C"}
|
names = []string{"Temp °C"}
|
||||||
yMin = floatPtr(0)
|
yMin = floatPtr(0)
|
||||||
yMax = autoMax120(temp)
|
yMax = autoMax120(temp)
|
||||||
|
case "clock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Core Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
|
case "memclock":
|
||||||
|
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||||
|
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||||
|
if clock == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{clock}
|
||||||
|
names = []string{"Memory Clock MHz"}
|
||||||
|
yMin, yMax = autoBounds120(clock)
|
||||||
default:
|
default:
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
title = gpuDisplayLabel(idx) + " Power"
|
||||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
if power == nil {
|
if power == nil {
|
||||||
return nil, nil, nil, "", nil, nil, false
|
return nil, nil, nil, "", nil, nil, false
|
||||||
@@ -618,6 +767,26 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
|||||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseGPUChartPath splits a per-GPU chart path of the form
// "gpu/<index>[-<sub>]" into the GPU index and the sub-chart name.
//
// sub is the text after the last '-' in the part following "gpu/"; it is ""
// when there is no '-' or nothing follows it. The index must consist solely of
// decimal digits. ok is false, with idx 0 and sub "", when the prefix is
// missing, nothing follows it, or the index is not a plain non-negative
// decimal number.
func parseGPUChartPath(path string) (idx int, sub string, ok bool) {
	if !strings.HasPrefix(path, "gpu/") {
		return 0, "", false
	}
	rest := strings.TrimPrefix(path, "gpu/")
	if rest == "" {
		return 0, "", false
	}
	if i := strings.LastIndex(rest, "-"); i > 0 {
		rest, sub = rest[:i], rest[i+1:]
	}
	// Sscanf on its own accepts a sign and silently stops at trailing garbage
	// ("1x" would parse as 1), so insist on digits only before converting.
	for _, c := range rest {
		if c < '0' || c > '9' {
			return 0, "", false
		}
	}
	if n, err := fmt.Sscanf(rest, "%d", &idx); err != nil || n != 1 {
		return 0, "", false
	}
	return idx, sub, true
}
|
||||||
|
|
||||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||||
labels := make([]string, len(samples))
|
labels := make([]string, len(samples))
|
||||||
if len(samples) == 0 {
|
if len(samples) == 0 {
|
||||||
@@ -710,7 +879,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, ds)
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
names = append(names, gpuDisplayLabel(idx))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
@@ -843,64 +1012,37 @@ func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
|||||||
return floatPtr(low), floatPtr(high)
|
return floatPtr(low), floatPtr(high)
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
// gpuChartLabelIndices selects a subset of the indices 0..total-1, aiming for
// roughly target evenly spaced entries.
//
// Every step-th index is returned, starting at 0, where step is total/target
// and at least 1; the last index (total-1) is always included. The result is
// nil when total <= 0. A target of zero or less selects every index.
func gpuChartLabelIndices(total, target int) []int {
	if total <= 0 {
		return nil
	}
	if total == 1 {
		return []int{0}
	}
	// Guard the division: a zero target would otherwise panic.
	step := 1
	if target > 0 && total/target > 1 {
		step = total / target
	}
	indices := make([]int, 0, total/step+2)
	for i := 0; i < total; i += step {
		indices = append(indices, i)
	}
	if indices[len(indices)-1] != total-1 {
		indices = append(indices, total-1)
	}
	return indices
}
|
||||||
|
|
||||||
opt := gocharts.NewLineChartOptionWithData(datasets)
|
func chartCanvasHeightForPath(path string, seriesCount int) int {
|
||||||
opt.Title = gocharts.TitleOption{Text: title}
|
height := chartCanvasHeight(seriesCount)
|
||||||
opt.XAxis.Labels = sparse
|
if isGPUChartPath(path) {
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
return height * 2
|
||||||
if chartLegendVisible(len(names)) {
|
|
||||||
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
|
||||||
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
|
||||||
} else {
|
|
||||||
opt.Legend.Show = gocharts.Ptr(false)
|
|
||||||
}
|
|
||||||
opt.Symbol = gocharts.SymbolNone
|
|
||||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
|
||||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
|
||||||
if yMin != nil || yMax != nil {
|
|
||||||
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
|
||||||
}
|
}
|
||||||
|
return height
|
||||||
|
}
|
||||||
|
|
||||||
// Add a single peak mark line on the series that holds the global maximum.
|
// isGPUChartPath reports whether path starts with one of the GPU chart
// prefixes: "gpu-all-" or "gpu/".
func isGPUChartPath(path string) bool {
	for _, prefix := range [...]string{"gpu-all-", "gpu/"} {
		if strings.HasPrefix(path, prefix) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func chartLegendVisible(seriesCount int) bool {
|
func chartLegendVisible(seriesCount int) bool {
|
||||||
@@ -914,30 +1056,6 @@ func chartCanvasHeight(seriesCount int) int {
|
|||||||
return 288
|
return 288
|
||||||
}
|
}
|
||||||
|
|
||||||
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
|
||||||
return gocharts.YAxisOption{
|
|
||||||
Min: yMin,
|
|
||||||
Max: yMax,
|
|
||||||
LabelCount: 11,
|
|
||||||
ValueFormatter: chartYAxisNumber,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// globalPeakSeries finds the largest value across all datasets and reports the
// index of the series that holds it together with the value itself. Only
// values greater than zero are considered; when there are none the result is
// (-1, 0). On a tie the earliest series wins.
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
	idx, peak = -1, 0
	for series := 0; series < len(datasets); series++ {
		for _, value := range datasets[series] {
			if value > peak {
				peak, idx = value, series
			}
		}
	}
	return idx, peak
}
|
|
||||||
|
|
||||||
// globalStats returns min, average, and max across all values in all datasets.
|
// globalStats returns min, average, and max across all values in all datasets.
|
||||||
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
func globalStats(datasets [][]float64) (mn, avg, mx float64) {
|
||||||
var sum float64
|
var sum float64
|
||||||
@@ -977,21 +1095,6 @@ func sanitizeChartText(s string) string {
|
|||||||
}, s))
|
}, s))
|
||||||
}
|
}
|
||||||
|
|
||||||
func sanitizeChartTexts(in []string) []string {
|
|
||||||
out := make([]string, len(in))
|
|
||||||
for i, s := range in {
|
|
||||||
out[i] = sanitizeChartText(s)
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// safeIdx returns s[i], or 0 when i lies outside the bounds of s (negative
// indices included).
func safeIdx(s []float64, i int) float64 {
	if i < 0 || i >= len(s) {
		return 0
	}
	return s[i]
}
|
|
||||||
|
|
||||||
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []string) {
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
@@ -1078,20 +1181,6 @@ func chartYAxisNumber(v float64) string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
// sparseLabels returns a copy of labels, of the same length, in which only
// every step-th entry, starting with the first, keeps its text and all others
// are "". step is len(labels)/n and at least 1, so roughly n labels survive.
// An n of zero or less keeps every label.
func sparseLabels(labels []string, n int) []string {
	// Guard the division: a zero n would otherwise panic.
	step := 1
	if n > 0 && len(labels)/n > 1 {
		step = len(labels) / n
	}
	out := make([]string, len(labels))
	for i := 0; i < len(labels); i += step {
		out[i] = labels[i]
	}
	return out
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Request) {
|
||||||
if h.metricsDB == nil {
|
if h.metricsDB == nil {
|
||||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
@@ -1107,6 +1196,11 @@ func (h *handler) handleAPIMetricsExportCSV(w http.ResponseWriter, r *http.Reque
|
|||||||
|
|
||||||
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleReady(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
if strings.TrimSpace(h.opts.AuditPath) == "" {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, _ = w.Write([]byte("ready"))
|
||||||
|
return
|
||||||
|
}
|
||||||
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||||
w.WriteHeader(http.StatusServiceUnavailable)
|
w.WriteHeader(http.StatusServiceUnavailable)
|
||||||
_, _ = w.Write([]byte("starting"))
|
_, _ = w.Write([]byte("starting"))
|
||||||
@@ -1120,37 +1214,106 @@ const loadingPageHTML = `<!DOCTYPE html>
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>EASY-BEE</title>
|
<title>EASY-BEE — Starting</title>
|
||||||
<style>
|
<style>
|
||||||
*{margin:0;padding:0;box-sizing:border-box}
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
.wrap{text-align:center;width:420px}
|
||||||
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||||
|
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||||
|
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||||
|
.spinner.hidden{display:none}
|
||||||
@keyframes spin{to{transform:rotate(360deg)}}
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
|
||||||
|
table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
|
||||||
|
td{padding:3px 6px;text-align:left}
|
||||||
|
td:first-child{color:#718096;width:55%}
|
||||||
|
.ok{color:#68d391}
|
||||||
|
.run{color:#f6c90e}
|
||||||
|
.fail{color:#fc8181}
|
||||||
|
.dim{color:#4a5568}
|
||||||
|
.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
|
||||||
|
.btn:hover{border-color:#718096;color:#e2e8f0}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div style="text-align:center">
|
<div class="wrap">
|
||||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||||
<div class="spinner"></div>
|
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||||
<div class="status" id="s">Starting up...</div>
|
<div class="spinner" id="spin"></div>
|
||||||
|
<div class="status" id="st">Connecting to bee-web...</div>
|
||||||
|
<table id="tbl"></table>
|
||||||
|
<button class="btn" id="btn" onclick="go()">Open app now</button>
|
||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
function probe(){
|
(function(){
|
||||||
fetch('/api/ready',{cache:'no-store'})
|
var gone = false;
|
||||||
.then(function(r){
|
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||||
if(r.ok){window.location.replace('/');}
|
|
||||||
else{setTimeout(probe,1000);}
|
function icon(s){
|
||||||
|
if(s==='active') return '<span class="ok">● active</span>';
|
||||||
|
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||||
|
if(s==='activating'||s==='reloading') return '<span class="run">○ starting</span>';
|
||||||
|
if(s==='inactive') return '<span class="dim">○ inactive</span>';
|
||||||
|
return '<span class="dim">'+s+'</span>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function allSettled(svcs){
|
||||||
|
for(var i=0;i<svcs.length;i++){
|
||||||
|
var s=svcs[i].state;
|
||||||
|
if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
var pollTimer=null;
|
||||||
|
|
||||||
|
function pollServices(){
|
||||||
|
fetch('/api/services',{cache:'no-store'})
|
||||||
|
.then(function(r){return r.json();})
|
||||||
|
.then(function(svcs){
|
||||||
|
if(!svcs||!svcs.length) return;
|
||||||
|
var tbl=document.getElementById('tbl');
|
||||||
|
tbl.style.display='';
|
||||||
|
var html='';
|
||||||
|
for(var i=0;i<svcs.length;i++)
|
||||||
|
html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
|
||||||
|
tbl.innerHTML=html;
|
||||||
|
if(allSettled(svcs)){
|
||||||
|
clearInterval(pollTimer);
|
||||||
|
document.getElementById('spin').className='spinner hidden';
|
||||||
|
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||||
|
setTimeout(go,800);
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.catch(function(){setTimeout(probe,1000);});
|
.catch(function(){});
|
||||||
|
}
|
||||||
|
|
||||||
|
function probe(){
|
||||||
|
fetch('/healthz',{cache:'no-store'})
|
||||||
|
.then(function(r){
|
||||||
|
if(r.ok){
|
||||||
|
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||||
|
document.getElementById('btn').style.display='';
|
||||||
|
pollServices();
|
||||||
|
pollTimer=setInterval(pollServices,1500);
|
||||||
|
} else {
|
||||||
|
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(function(){
|
||||||
|
document.getElementById('st').textContent='Waiting for bee-web to start...';
|
||||||
|
setTimeout(probe,500);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
probe();
|
probe();
|
||||||
|
})();
|
||||||
</script>
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>`
|
</html>`
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -34,6 +35,49 @@ func TestChartLegendNumber(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewareReturns500OnPanic(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
panic("boom")
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/panic", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusInternalServerError {
|
||||||
|
t.Fatalf("status=%d want %d", rec.Code, http.StatusInternalServerError)
|
||||||
|
}
|
||||||
|
if !strings.Contains(rec.Body.String(), "internal server error") {
|
||||||
|
t.Fatalf("body=%q", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRecoverMiddlewarePreservesStreamingInterfaces(t *testing.T) {
|
||||||
|
handler := recoverMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if !sseStart(w) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !sseWrite(w, "tick", "ok") {
|
||||||
|
t.Fatal("expected sse write to succeed")
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/stream", nil)
|
||||||
|
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if got := rec.Header().Get("Content-Type"); got != "text/event-stream" {
|
||||||
|
t.Fatalf("content-type=%q", got)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "event: tick\n") || !strings.Contains(body, "data: ok\n\n") {
|
||||||
|
t.Fatalf("body=%q", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
samples := []platform.LiveMetricSample{
|
samples := []platform.LiveMetricSample{
|
||||||
{
|
{
|
||||||
@@ -136,6 +180,39 @@ func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1400},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1500},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, ClockMHz: 1410},
|
||||||
|
{GPUIndex: 3, ClockMHz: 1510},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-clock", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("gpu-all-clock returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Core Clock" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if got := datasets[1][1]; got != 1510 {
|
||||||
|
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
want := []float64{0, 480, 480, 480, 510, 510}
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
@@ -157,6 +234,21 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
|||||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
}
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-metrics-section" style="display:none`) {
|
||||||
|
t.Fatalf("metrics page should keep gpu charts in a hidden dedicated section until GPUs are detected: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="gpu-chart-toggle"`) {
|
||||||
|
t.Fatalf("metrics page should render GPU chart mode toggle: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||||
|
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||||
|
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||||
|
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartLegendVisible(t *testing.T) {
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
@@ -199,6 +291,124 @@ func TestChartCanvasHeight(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartTimelineSegmentsForRangeMergesActiveSpansAndIdleGaps(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
end := start.Add(10 * time.Minute)
|
||||||
|
taskWindow := func(offsetStart, offsetEnd time.Duration) Task {
|
||||||
|
s := start.Add(offsetStart)
|
||||||
|
e := start.Add(offsetEnd)
|
||||||
|
return Task{
|
||||||
|
Name: "task",
|
||||||
|
Status: TaskDone,
|
||||||
|
StartedAt: &s,
|
||||||
|
DoneAt: &e,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
segments := chartTimelineSegmentsForRange(start, end, end, []Task{
|
||||||
|
taskWindow(1*time.Minute, 3*time.Minute),
|
||||||
|
taskWindow(2*time.Minute, 5*time.Minute),
|
||||||
|
taskWindow(7*time.Minute, 8*time.Minute),
|
||||||
|
})
|
||||||
|
if len(segments) != 5 {
|
||||||
|
t.Fatalf("segments=%d want 5: %#v", len(segments), segments)
|
||||||
|
}
|
||||||
|
wantActive := []bool{false, true, false, true, false}
|
||||||
|
wantMinutes := [][2]int{{0, 1}, {1, 5}, {5, 7}, {7, 8}, {8, 10}}
|
||||||
|
for i, segment := range segments {
|
||||||
|
if segment.Active != wantActive[i] {
|
||||||
|
t.Fatalf("segment[%d].Active=%v want %v", i, segment.Active, wantActive[i])
|
||||||
|
}
|
||||||
|
if got := int(segment.Start.Sub(start).Minutes()); got != wantMinutes[i][0] {
|
||||||
|
t.Fatalf("segment[%d] start=%d want %d", i, got, wantMinutes[i][0])
|
||||||
|
}
|
||||||
|
if got := int(segment.End.Sub(start).Minutes()); got != wantMinutes[i][1] {
|
||||||
|
t.Fatalf("segment[%d] end=%d want %d", i, got, wantMinutes[i][1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricChartSVGIncludesTimelineOverlay(t *testing.T) {
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
labels := []string{"12:00", "12:01", "12:02"}
|
||||||
|
times := []time.Time{start, start.Add(time.Minute), start.Add(2 * time.Minute)}
|
||||||
|
svg, err := renderMetricChartSVG(
|
||||||
|
"System Power",
|
||||||
|
labels,
|
||||||
|
times,
|
||||||
|
[][]float64{{300, 320, 310}},
|
||||||
|
[]string{"Power W"},
|
||||||
|
floatPtr(0),
|
||||||
|
floatPtr(400),
|
||||||
|
360,
|
||||||
|
[]chartTimelineSegment{
|
||||||
|
{Start: start, End: start.Add(time.Minute), Active: false},
|
||||||
|
{Start: start.Add(time.Minute), End: start.Add(2 * time.Minute), Active: true},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
body := string(svg)
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("svg missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `opacity="0.10"`) {
|
||||||
|
t.Fatalf("svg missing idle overlay opacity: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `System Power`) {
|
||||||
|
t.Fatalf("svg missing chart title: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleMetricsChartSVGRendersCustomSVG(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
db, err := openMetricsDB(filepath.Join(dir, "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { _ = db.db.Close() })
|
||||||
|
|
||||||
|
start := time.Date(2026, 4, 5, 12, 0, 0, 0, time.UTC)
|
||||||
|
for i, sample := range []platform.LiveMetricSample{
|
||||||
|
{Timestamp: start, PowerW: 300},
|
||||||
|
{Timestamp: start.Add(time.Minute), PowerW: 320},
|
||||||
|
{Timestamp: start.Add(2 * time.Minute), PowerW: 310},
|
||||||
|
} {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("write sample %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
prevTasks := globalQueue.tasks
|
||||||
|
s := start.Add(30 * time.Second)
|
||||||
|
e := start.Add(90 * time.Second)
|
||||||
|
globalQueue.tasks = []*Task{{Name: "Burn", Status: TaskDone, StartedAt: &s, DoneAt: &e}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = prevTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{ExportDir: dir}, metricsDB: db}
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/metrics/chart/server-power.svg", nil)
|
||||||
|
h.handleMetricsChartSVG(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `data-role="timeline-overlay"`) {
|
||||||
|
t.Fatalf("custom svg response missing timeline overlay: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `stroke-linecap="round"`) {
|
||||||
|
t.Fatalf("custom svg response missing custom polyline styling: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
@@ -212,21 +422,6 @@ func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChartYAxisOption(t *testing.T) {
|
|
||||||
min := floatPtr(0)
|
|
||||||
max := floatPtr(100)
|
|
||||||
opt := chartYAxisOption(min, max)
|
|
||||||
if opt.Min != min || opt.Max != max {
|
|
||||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
|
||||||
}
|
|
||||||
if opt.LabelCount != 11 {
|
|
||||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
|
||||||
}
|
|
||||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
|
||||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
r1 := newMetricsRing(4)
|
r1 := newMetricsRing(4)
|
||||||
r2 := newMetricsRing(4)
|
r2 := newMetricsRing(4)
|
||||||
@@ -275,9 +470,10 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
handler := NewHandler(HandlerOptions{
|
handler := NewHandler(HandlerOptions{
|
||||||
Title: "Bee Hardware Audit",
|
Title: "Bee Hardware Audit",
|
||||||
AuditPath: path,
|
BuildLabel: "1.2.3",
|
||||||
ExportDir: exportDir,
|
AuditPath: path,
|
||||||
|
ExportDir: exportDir,
|
||||||
})
|
})
|
||||||
|
|
||||||
first := httptest.NewRecorder()
|
first := httptest.NewRecorder()
|
||||||
@@ -292,6 +488,11 @@ func TestRootRendersDashboard(t *testing.T) {
|
|||||||
if !strings.Contains(first.Body.String(), `/viewer`) {
|
if !strings.Contains(first.Body.String(), `/viewer`) {
|
||||||
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
t.Fatalf("first body missing viewer link: %s", first.Body.String())
|
||||||
}
|
}
|
||||||
|
versionIdx := strings.Index(first.Body.String(), `Version 1.2.3`)
|
||||||
|
navIdx := strings.Index(first.Body.String(), `href="/"`)
|
||||||
|
if versionIdx == -1 || navIdx == -1 || versionIdx > navIdx {
|
||||||
|
t.Fatalf("version should render near top of sidebar before nav links: %s", first.Body.String())
|
||||||
|
}
|
||||||
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
if got := first.Header().Get("Cache-Control"); got != "no-store" {
|
||||||
t.Fatalf("first cache-control=%q", got)
|
t.Fatalf("first cache-control=%q", got)
|
||||||
}
|
}
|
||||||
@@ -329,7 +530,7 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `Run Audit`) {
|
if !strings.Contains(body, `onclick="auditModalRun()">Run audit</button>`) {
|
||||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||||
}
|
}
|
||||||
if strings.Contains(body, `No audit data`) {
|
if strings.Contains(body, `No audit data`) {
|
||||||
@@ -337,6 +538,18 @@ func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/api/ready", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(rec.Body.String()) != "ready" {
|
||||||
|
t.Fatalf("body=%q want ready", rec.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -359,7 +572,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||||
@@ -367,8 +580,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
|||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||||
@@ -389,12 +602,295 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
|||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
if !strings.Contains(body, `restartGPUDrivers()`) {
|
||||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("tools page missing boot source field: %s", body)
|
||||||
}
|
}
|
||||||
|
if !strings.Contains(body, `Export to USB`) {
|
||||||
|
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||||
|
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`href="/benchmark"`,
|
||||||
|
`id="benchmark-gpu-list"`,
|
||||||
|
`/api/gpu/nvidia`,
|
||||||
|
`/api/benchmark/nvidia/run`,
|
||||||
|
`benchmark-run-nccl`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Index: 1,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1168.50,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 0`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 1`,
|
||||||
|
`#1`,
|
||||||
|
wantTime,
|
||||||
|
`1176.25`,
|
||||||
|
`1168.50`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA GPU Targeted Stress`,
|
||||||
|
`nvidia-targeted-stress`,
|
||||||
|
`controlled NVIDIA DCGM load`,
|
||||||
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
|
`NVIDIA GPU Selection`,
|
||||||
|
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||||
|
`Select All`,
|
||||||
|
`id="sat-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`NVIDIA Max Compute Load`,
|
||||||
|
`dcgmproftester`,
|
||||||
|
`targeted_stress remain in <a href="/validate">Validate</a>`,
|
||||||
|
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
|
||||||
|
`id="burn-gpu-list"`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("burn page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||||
|
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
reportPath := filepath.Join(reportDir, "report.html")
|
||||||
|
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
ArtifactsDir: reportDir,
|
||||||
|
ReportHTMLPath: reportPath,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||||
|
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `saved report`) {
|
||||||
|
t.Fatalf("task detail page missing saved report: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `Back to Tasks`) {
|
||||||
|
t.Fatalf("task detail page missing back link: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-live-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, `Cancel</button>`) {
|
||||||
|
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||||
|
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||||
|
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||||
|
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||||
|
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC()
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||||
|
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||||
|
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||||
|
}
|
||||||
|
for _, sample := range samples {
|
||||||
|
if err := db.Write(sample); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||||
|
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = []*Task{{
|
||||||
|
ID: "task-chart-1",
|
||||||
|
Name: "Power Window",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
DoneAt: &done,
|
||||||
|
}}
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||||
|
req.SetPathValue("id", "task-chart-1")
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, req)
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
if !strings.Contains(body, "System Power") {
|
||||||
|
t.Fatalf("task chart missing expected title: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "min 200") {
|
||||||
|
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||||
|
}
|
||||||
|
if strings.Contains(body, "min 100") {
|
||||||
|
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||||
@@ -518,3 +1014,98 @@ func TestRuntimeHealthEndpointReturnsJSON(t *testing.T) {
|
|||||||
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
t.Fatalf("body=%q want %q", strings.TrimSpace(rec.Body.String()), body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z","hardware":{"board":{"serial_number":"SERIAL-1"}}}`), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
health := `{
|
||||||
|
"status":"PARTIAL",
|
||||||
|
"checked_at":"2026-03-16T10:00:00Z",
|
||||||
|
"export_dir":"/tmp/export",
|
||||||
|
"driver_ready":true,
|
||||||
|
"cuda_ready":false,
|
||||||
|
"network_status":"PARTIAL",
|
||||||
|
"issues":[
|
||||||
|
{"code":"dhcp_partial","description":"At least one interface did not obtain IPv4 connectivity."},
|
||||||
|
{"code":"cuda_runtime_not_ready","description":"CUDA runtime is not ready for GPU SAT."}
|
||||||
|
],
|
||||||
|
"tools":[
|
||||||
|
{"name":"dmidecode","ok":true},
|
||||||
|
{"name":"nvidia-smi","ok":false}
|
||||||
|
],
|
||||||
|
"services":[
|
||||||
|
{"name":"bee-web","status":"active"},
|
||||||
|
{"name":"bee-nvidia","status":"inactive"}
|
||||||
|
]
|
||||||
|
}`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
componentStatus := `[
|
||||||
|
{
|
||||||
|
"component_key":"cpu:all",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"cpu SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:00:00Z","status":"Warning","source":"sat:cpu","detail":"cpu SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"memory:all",
|
||||||
|
"status":"OK",
|
||||||
|
"history":[{"at":"2026-03-16T10:01:00Z","status":"OK","source":"sat:memory","detail":"memory SAT: OK"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"storage:nvme0n1",
|
||||||
|
"status":"Critical",
|
||||||
|
"error_summary":"storage SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:02:00Z","status":"Critical","source":"sat:storage","detail":"storage SAT: FAILED"}]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"component_key":"pcie:gpu:nvidia",
|
||||||
|
"status":"Warning",
|
||||||
|
"error_summary":"nvidia SAT: FAILED",
|
||||||
|
"history":[{"at":"2026-03-16T10:03:00Z","status":"Warning","source":"sat:nvidia","detail":"nvidia SAT: FAILED"}]
|
||||||
|
}
|
||||||
|
]`
|
||||||
|
if err := os.WriteFile(filepath.Join(exportDir, "component-status.json"), []byte(componentStatus), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{AuditPath: path, ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Runtime Health`,
|
||||||
|
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||||
|
`Export Directory`,
|
||||||
|
`Network`,
|
||||||
|
`NVIDIA/AMD Driver`,
|
||||||
|
`CUDA / ROCm`,
|
||||||
|
`Required Utilities`,
|
||||||
|
`Bee Services`,
|
||||||
|
`<td>CPU</td>`,
|
||||||
|
`<td>Memory</td>`,
|
||||||
|
`<td>Storage</td>`,
|
||||||
|
`<td>GPU</td>`,
|
||||||
|
`CUDA runtime is not ready for GPU SAT.`,
|
||||||
|
`Missing: nvidia-smi`,
|
||||||
|
`bee-nvidia=inactive`,
|
||||||
|
`cpu SAT: FAILED`,
|
||||||
|
`storage SAT: FAILED`,
|
||||||
|
`sat:nvidia`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
42
audit/internal/webui/stability.go
Normal file
42
audit/internal/webui/stability.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
|
"runtime/debug"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
func goRecoverLoop(name string, restartDelay time.Duration, fn func()) {
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
if !runRecoverable(name, fn) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if restartDelay > 0 {
|
||||||
|
time.Sleep(restartDelay)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func goRecoverOnce(name string, fn func()) {
|
||||||
|
go func() {
|
||||||
|
_ = runRecoverable(name, fn)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// runRecoverable calls fn and converts a panic into a log record instead of
// letting it propagate. It returns true when fn panicked and false when fn
// returned normally. The record carries the component name, the panic value
// and the stack captured at recovery time.
func runRecoverable(name string, fn func()) (panicked bool) {
	defer func() {
		cause := recover()
		if cause == nil {
			return
		}
		// Overrides the explicit "return false" below through the named result.
		panicked = true
		stack := string(debug.Stack())
		slog.Error("recovered panic",
			"component", name,
			"panic", fmt.Sprint(cause),
			"stack", stack,
		)
	}()
	fn()
	return false
}
|
||||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
	"encoding/json"
	"fmt"
	"html"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	"bee/audit/internal/platform"
)
|
||||||
|
|
||||||
|
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
task, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
snapshot := *task
|
||||||
|
body := renderTaskDetailPage(h.opts, snapshot)
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
_, _ = w.Write([]byte(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
type taskChartIndexEntry struct {
|
||||||
|
Title string `json:"title"`
|
||||||
|
File string `json:"file"`
|
||||||
|
}
|
||||||
|
entries := make([]taskChartIndexEntry, 0)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||||
|
}
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
_ = json.NewEncoder(w).Encode(entries)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||||
|
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||||
|
path, ok := taskChartPathFromFile(file)
|
||||||
|
if !ok {
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||||
|
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||||
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||||
|
title := task.Name
|
||||||
|
if strings.TrimSpace(title) == "" {
|
||||||
|
title = task.ID
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||||
|
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||||
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
|
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||||
|
body.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if report := loadTaskReportFragment(task); report != "" {
|
||||||
|
body.WriteString(report)
|
||||||
|
} else {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||||
|
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||||
|
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||||
|
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||||
|
}
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if task.Status == TaskRunning {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
|
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||||
|
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||||
|
body.WriteString(`</div></div>`)
|
||||||
|
body.WriteString(`<script>
|
||||||
|
function cancelTaskDetail(id) {
|
||||||
|
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||||
|
var term = document.getElementById('task-live-log');
|
||||||
|
if (term) {
|
||||||
|
term.textContent += '\nCancel requested.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function renderTaskLiveCharts(taskId, charts) {
|
||||||
|
const host = document.getElementById('task-live-charts');
|
||||||
|
if (!host) return;
|
||||||
|
if (!Array.isArray(charts) || charts.length === 0) {
|
||||||
|
host.innerHTML = 'Waiting for metric samples...';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const seen = {};
|
||||||
|
charts.forEach(function(chart) {
|
||||||
|
seen[chart.file] = true;
|
||||||
|
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||||
|
if (img) {
|
||||||
|
const card = img.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
const title = card.querySelector('.card-head');
|
||||||
|
if (title) title.textContent = chart.title;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const card = document.createElement('div');
|
||||||
|
card.className = 'card';
|
||||||
|
card.style.margin = '0';
|
||||||
|
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||||
|
card.querySelector('.card-head').textContent = chart.title;
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
img = document.createElement('img');
|
||||||
|
img.setAttribute('data-task-chart', '1');
|
||||||
|
img.setAttribute('data-chart-file', chart.file);
|
||||||
|
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||||
|
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||||
|
img.style.width = '100%';
|
||||||
|
img.style.display = 'block';
|
||||||
|
img.style.borderRadius = '6px';
|
||||||
|
img.alt = chart.title;
|
||||||
|
body.appendChild(img);
|
||||||
|
host.appendChild(card);
|
||||||
|
});
|
||||||
|
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||||
|
const file = img.getAttribute('data-chart-file') || '';
|
||||||
|
if (seen[file]) return;
|
||||||
|
const card = img.closest('.card');
|
||||||
|
if (card) card.remove();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function loadTaskLiveCharts(taskId) {
|
||||||
|
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||||
|
renderTaskLiveCharts(taskId, charts);
|
||||||
|
}).catch(function(){
|
||||||
|
const host = document.getElementById('task-live-charts');
|
||||||
|
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function refreshTaskLiveCharts() {
|
||||||
|
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||||
|
const base = img.dataset.baseSrc;
|
||||||
|
if (!base) return;
|
||||||
|
img.src = base + '?t=' + Date.now();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||||
|
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||||
|
var _taskChartTimer = null;
|
||||||
|
var _taskChartsFrozen = false;
|
||||||
|
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||||
|
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||||
|
_taskDetailES.addEventListener('done', function(e){
|
||||||
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
_taskChartsFrozen = true;
|
||||||
|
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||||
|
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
});
|
||||||
|
_taskDetailES.onerror = function(){
|
||||||
|
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||||
|
if (_taskDetailES) {
|
||||||
|
_taskDetailES.close();
|
||||||
|
_taskDetailES = null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
|
_taskChartTimer = setInterval(function(){
|
||||||
|
if (_taskChartsFrozen) return;
|
||||||
|
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||||
|
refreshTaskLiveCharts();
|
||||||
|
}, 2000);
|
||||||
|
</script>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
return layoutHead(opts.Title+" — "+title) +
|
||||||
|
layoutNav("tasks", opts.BuildLabel) +
|
||||||
|
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||||
|
body.String() +
|
||||||
|
`</div></div></body></html>`
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadTaskReportFragment(task Task) string {
|
||||||
|
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil || len(data) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||||
|
if strings.TrimSpace(absPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||||
|
id := r.PathValue("id")
|
||||||
|
taskPtr, ok := globalQueue.findByID(id)
|
||||||
|
if !ok {
|
||||||
|
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||||
|
}
|
||||||
|
task := *taskPtr
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
samples, err := loadTaskMetricSamples(start, end)
|
||||||
|
if err != nil {
|
||||||
|
return task, nil, start, end, true
|
||||||
|
}
|
||||||
|
return task, samples, start, end, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||||
|
start, end := taskTimeWindow(&task)
|
||||||
|
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskChartPathFromFile(file string) (string, bool) {
|
||||||
|
file = strings.TrimSpace(file)
|
||||||
|
for _, spec := range taskDashboardChartSpecs {
|
||||||
|
if spec.File == file {
|
||||||
|
return spec.Path, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||||
|
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||||
|
return "gpu/" + id + "-overview", true
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
343
audit/internal/webui/task_report.go
Normal file
343
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,343 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"html"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
// taskReportMetricsDBPath is the metrics database opened by
// loadTaskMetricSamples. It is a variable rather than a direct use of
// metricsDBPath so that tests can point it at a temporary database.
var taskReportMetricsDBPath = metricsDBPath
|
||||||
|
|
||||||
|
type (
	// taskReport is the JSON document written next to a task's artifacts. It
	// carries the task identity, status and timing, the name of the log file,
	// and the charts that were rendered for the task window.
	taskReport struct {
		ID     string `json:"id"`
		Name   string `json:"name"`
		Target string `json:"target"`
		Status string `json:"status"`

		// Timing. StartedAt and DoneAt are omitted when unset.
		CreatedAt   time.Time  `json:"created_at"`
		StartedAt   *time.Time `json:"started_at,omitempty"`
		DoneAt      *time.Time `json:"done_at,omitempty"`
		DurationSec int        `json:"duration_sec,omitempty"`

		Error   string `json:"error,omitempty"`
		LogFile string `json:"log_file,omitempty"`

		Charts      []taskReportChart `json:"charts,omitempty"`
		GeneratedAt time.Time         `json:"generated_at"`
	}

	// taskReportChart names one chart of a report: its display title and the
	// file it was saved under.
	taskReportChart struct {
		Title string `json:"title"`
		File  string `json:"file"`
	}
)
|
||||||
|
|
||||||
|
// taskChartSpec pairs a chart path understood by the chart renderer with the
// file name the chart is saved and served under.
type taskChartSpec struct {
	Path string
	File string
}

// taskDashboardChartSpecs lists the charts offered for every task, in display
// order. Each file name is the chart path with ".svg" appended.
var taskDashboardChartSpecs = func() []taskChartSpec {
	paths := [...]string{
		"server-load",
		"server-temp-cpu",
		"server-temp-ambient",
		"server-power",
		"server-fans",
		"gpu-all-load",
		"gpu-all-memload",
		"gpu-all-clock",
		"gpu-all-power",
		"gpu-all-temp",
	}
	specs := make([]taskChartSpec, len(paths))
	for i, p := range paths {
		specs[i] = taskChartSpec{Path: p, File: p + ".svg"}
	}
	return specs
}()
|
||||||
|
|
||||||
|
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||||
|
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||||
|
specs = append(specs, taskDashboardChartSpecs...)
|
||||||
|
for _, idx := range taskGPUIndices(samples) {
|
||||||
|
specs = append(specs, taskChartSpec{
|
||||||
|
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||||
|
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return specs
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskReportArtifacts(t *Task) error {
|
||||||
|
if t == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
start, end := taskTimeWindow(t)
|
||||||
|
samples, _ := loadTaskMetricSamples(start, end)
|
||||||
|
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||||
|
|
||||||
|
logText := ""
|
||||||
|
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||||
|
logText = string(data)
|
||||||
|
}
|
||||||
|
|
||||||
|
report := taskReport{
|
||||||
|
ID: t.ID,
|
||||||
|
Name: t.Name,
|
||||||
|
Target: t.Target,
|
||||||
|
Status: t.Status,
|
||||||
|
CreatedAt: t.CreatedAt,
|
||||||
|
StartedAt: t.StartedAt,
|
||||||
|
DoneAt: t.DoneAt,
|
||||||
|
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||||
|
Error: t.ErrMsg,
|
||||||
|
LogFile: filepath.Base(t.LogPath),
|
||||||
|
Charts: charts,
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||||
|
}
|
||||||
|
|
||||||
|
func reportDoneTime(t *Task) time.Time {
|
||||||
|
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
return *t.DoneAt
|
||||||
|
}
|
||||||
|
return time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||||
|
if t == nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
return now, now
|
||||||
|
}
|
||||||
|
start := t.CreatedAt.UTC()
|
||||||
|
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||||
|
start = t.StartedAt.UTC()
|
||||||
|
}
|
||||||
|
end := time.Now().UTC()
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = t.DoneAt.UTC()
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
end = start
|
||||||
|
}
|
||||||
|
return start, end
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||||
|
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
return db.LoadBetween(start, end)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||||
|
var charts []taskReportChart
|
||||||
|
inline := make(map[string]string)
|
||||||
|
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||||
|
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||||
|
if !ok || len(svg) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(dir, spec.File)
|
||||||
|
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||||
|
inline[spec.File] = string(svg)
|
||||||
|
}
|
||||||
|
return charts, inline
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||||
|
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||||
|
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||||
|
if err != nil || !hasData {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||||
|
}
|
||||||
|
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||||
|
if !ok {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
buf, err := renderMetricChartSVG(
|
||||||
|
title,
|
||||||
|
labels,
|
||||||
|
sampleTimes(samples),
|
||||||
|
datasets,
|
||||||
|
names,
|
||||||
|
yMin,
|
||||||
|
yMax,
|
||||||
|
chartCanvasHeightForPath(path, len(names)),
|
||||||
|
timeline,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return "", nil, false
|
||||||
|
}
|
||||||
|
return title, buf, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var out []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if seen[g.GPUIndex] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
out = append(out, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Ints(out)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeJSONFile encodes v as indented JSON and writes it to path with mode
// 0644. It returns the encoding error or the write error, whichever occurs.
func writeJSONFile(path string, v any) error {
	encoded, err := json.MarshalIndent(v, "", " ")
	if err == nil {
		err = os.WriteFile(path, encoded, 0644)
	}
	return err
}
|
||||||
|
|
||||||
|
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||||
|
b.WriteString(`<div class="grid2">`)
|
||||||
|
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||||
|
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||||
|
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||||
|
if strings.TrimSpace(report.Error) != "" {
|
||||||
|
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||||
|
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||||
|
b.WriteString(`</div></div></div>`)
|
||||||
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
|
b.WriteString(benchmarkCard)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(report.Charts) > 0 {
|
||||||
|
for _, chart := range report.Charts {
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||||
|
b.WriteString(charts[chart.File])
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||||
|
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||||
|
b.WriteString(`</div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Benchmark Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
if runDir == archivePath {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskArchivePathFromLog scans logText from the last line to the first and
// returns the path from the most recent "Archive:" line that ends in ".tar.gz".
//
// Lines are trimmed before matching. A payload of the form
// "Archive written to <path>" is unwrapped to <path>. "Archive:" lines whose
// payload does not end in ".tar.gz" are skipped and the scan continues with
// earlier lines. An empty string is returned when nothing matches.
func taskArchivePathFromLog(logText string) string {
	const (
		marker  = "Archive:"
		wrapper = "Archive written to "
		ext     = ".tar.gz"
	)

	remaining := strings.Split(logText, "\n")
	for len(remaining) > 0 {
		last := len(remaining) - 1
		entry := strings.TrimSpace(remaining[last])
		remaining = remaining[:last]

		payload, isArchiveLine := strings.CutPrefix(entry, marker)
		if !isArchiveLine {
			continue
		}

		candidate := strings.TrimSpace(payload)
		if inner, wrapped := strings.CutPrefix(candidate, wrapper); wrapped {
			candidate = strings.TrimSpace(inner)
		}
		if strings.HasSuffix(candidate, ext) {
			return candidate
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
func renderTaskStatusBadge(status string) string {
|
||||||
|
className := map[string]string{
|
||||||
|
TaskRunning: "badge-ok",
|
||||||
|
TaskPending: "badge-unknown",
|
||||||
|
TaskDone: "badge-ok",
|
||||||
|
TaskFailed: "badge-err",
|
||||||
|
TaskCancelled: "badge-unknown",
|
||||||
|
}[status]
|
||||||
|
if className == "" {
|
||||||
|
className = "badge-unknown"
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(status)
|
||||||
|
if label == "" {
|
||||||
|
label = "unknown"
|
||||||
|
}
|
||||||
|
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatTaskTime formats a task timestamp in local time as
// "2006-01-02 15:04:05".
//
// ts is used when it is non-nil and non-zero; otherwise fallback is used.
// When neither carries a non-zero time the result is "n/a".
func formatTaskTime(ts *time.Time, fallback time.Time) string {
	const layout = "2006-01-02 15:04:05"

	chosen := fallback
	if ts != nil && !ts.IsZero() {
		chosen = *ts
	}
	if chosen.IsZero() {
		return "n/a"
	}
	return chosen.Local().Format(layout)
}
|
||||||
|
|
||||||
|
// formatTaskDuration renders a duration given in whole seconds.
//
// Non-positive values yield "n/a". Values under a minute print as "Ns", under
// an hour as "Mm SSs", and anything longer as "Hh MMm SSs".
func formatTaskDuration(sec int) string {
	hours, minutes, seconds := sec/3600, (sec%3600)/60, sec%60

	switch {
	case sec <= 0:
		return "n/a"
	case hours > 0:
		return fmt.Sprintf("%dh %02dm %02ds", hours, minutes, seconds)
	case minutes > 0:
		return fmt.Sprintf("%dm %02ds", minutes, seconds)
	default:
		return fmt.Sprintf("%ds", seconds)
	}
}
|
||||||
@@ -4,10 +4,12 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log/slog"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"runtime/debug"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -28,22 +30,29 @@ const (
|
|||||||
|
|
||||||
// taskNames maps target → human-readable name for validate (SAT) runs.
|
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||||
var taskNames = map[string]string{
|
var taskNames = map[string]string{
|
||||||
"nvidia": "NVIDIA SAT",
|
"nvidia": "NVIDIA SAT",
|
||||||
"nvidia-stress": "NVIDIA GPU Stress",
|
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
|
||||||
"memory": "Memory SAT",
|
"nvidia-benchmark": "NVIDIA Benchmark",
|
||||||
"storage": "Storage SAT",
|
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
|
||||||
"cpu": "CPU SAT",
|
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
|
||||||
"amd": "AMD GPU SAT",
|
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
|
||||||
"amd-mem": "AMD GPU MEM Integrity",
|
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
||||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
||||||
"amd-stress": "AMD GPU Burn-in",
|
"nvidia-stress": "NVIDIA GPU Stress",
|
||||||
"memory-stress": "Memory Burn-in",
|
"memory": "Memory SAT",
|
||||||
"sat-stress": "SAT Stress (stressapptest)",
|
"storage": "Storage SAT",
|
||||||
"platform-stress": "Platform Thermal Cycling",
|
"cpu": "CPU SAT",
|
||||||
"audit": "Audit",
|
"amd": "AMD GPU SAT",
|
||||||
"support-bundle": "Support Bundle",
|
"amd-mem": "AMD GPU MEM Integrity",
|
||||||
"install": "Install to Disk",
|
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||||
"install-to-ram": "Install to RAM",
|
"amd-stress": "AMD GPU Burn-in",
|
||||||
|
"memory-stress": "Memory Burn-in",
|
||||||
|
"sat-stress": "SAT Stress (stressapptest)",
|
||||||
|
"platform-stress": "Platform Thermal Cycling",
|
||||||
|
"audit": "Audit",
|
||||||
|
"support-bundle": "Support Bundle",
|
||||||
|
"install": "Install to Disk",
|
||||||
|
"install-to-ram": "Install to RAM",
|
||||||
}
|
}
|
||||||
|
|
||||||
// burnNames maps target → human-readable name when a burn profile is set.
|
// burnNames maps target → human-readable name when a burn profile is set.
|
||||||
@@ -83,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {
|
|||||||
|
|
||||||
// Task represents one unit of work in the queue.
|
// Task represents one unit of work in the queue.
|
||||||
type Task struct {
|
type Task struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Target string `json:"target"`
|
Target string `json:"target"`
|
||||||
Priority int `json:"priority"`
|
Priority int `json:"priority"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||||
ErrMsg string `json:"error,omitempty"`
|
ErrMsg string `json:"error,omitempty"`
|
||||||
LogPath string `json:"log_path,omitempty"`
|
LogPath string `json:"log_path,omitempty"`
|
||||||
|
ArtifactsDir string `json:"artifacts_dir,omitempty"`
|
||||||
|
ReportJSONPath string `json:"report_json_path,omitempty"`
|
||||||
|
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||||
|
|
||||||
// runtime fields (not serialised)
|
// runtime fields (not serialised)
|
||||||
job *jobState
|
job *jobState
|
||||||
@@ -106,80 +118,96 @@ type taskParams struct {
|
|||||||
DiagLevel int `json:"diag_level,omitempty"`
|
DiagLevel int `json:"diag_level,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type persistedTask struct {
|
type persistedTask struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Target string `json:"target"`
|
Target string `json:"target"`
|
||||||
Priority int `json:"priority"`
|
Priority int `json:"priority"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
ErrMsg string `json:"error,omitempty"`
|
ErrMsg string `json:"error,omitempty"`
|
||||||
LogPath string `json:"log_path,omitempty"`
|
LogPath string `json:"log_path,omitempty"`
|
||||||
Params taskParams `json:"params,omitempty"`
|
ArtifactsDir string `json:"artifacts_dir,omitempty"`
|
||||||
|
ReportJSONPath string `json:"report_json_path,omitempty"`
|
||||||
|
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||||
|
Params taskParams `json:"params,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type burnPreset struct {
|
type burnPreset struct {
|
||||||
NvidiaDiag int
|
|
||||||
DurationSec int
|
DurationSec int
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveBurnPreset(profile string) burnPreset {
|
func resolveBurnPreset(profile string) burnPreset {
|
||||||
switch profile {
|
switch profile {
|
||||||
case "overnight":
|
case "overnight":
|
||||||
return burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}
|
return burnPreset{DurationSec: 8 * 60 * 60}
|
||||||
case "acceptance":
|
case "acceptance":
|
||||||
return burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}
|
return burnPreset{DurationSec: 60 * 60}
|
||||||
default:
|
default:
|
||||||
return burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}
|
return burnPreset{DurationSec: 5 * 60}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
|
acceptanceCycles := []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 85, IdleSec: 5},
|
||||||
|
{LoadSec: 80, IdleSec: 10},
|
||||||
|
{LoadSec: 55, IdleSec: 5},
|
||||||
|
{LoadSec: 60, IdleSec: 0},
|
||||||
|
{LoadSec: 100, IdleSec: 10},
|
||||||
|
{LoadSec: 145, IdleSec: 15},
|
||||||
|
{LoadSec: 190, IdleSec: 20},
|
||||||
|
{LoadSec: 235, IdleSec: 25},
|
||||||
|
{LoadSec: 280, IdleSec: 30},
|
||||||
|
{LoadSec: 325, IdleSec: 35},
|
||||||
|
{LoadSec: 370, IdleSec: 40},
|
||||||
|
{LoadSec: 415, IdleSec: 45},
|
||||||
|
{LoadSec: 460, IdleSec: 50},
|
||||||
|
{LoadSec: 510, IdleSec: 0},
|
||||||
|
}
|
||||||
|
|
||||||
switch profile {
|
switch profile {
|
||||||
case "overnight":
|
case "overnight":
|
||||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
cycles := make([]platform.PlatformStressCycle, 0, len(acceptanceCycles)*8)
|
||||||
{LoadSec: 600, IdleSec: 120},
|
for range 8 {
|
||||||
{LoadSec: 600, IdleSec: 60},
|
cycles = append(cycles, acceptanceCycles...)
|
||||||
{LoadSec: 600, IdleSec: 30},
|
}
|
||||||
{LoadSec: 600, IdleSec: 120},
|
return platform.PlatformStressOptions{Cycles: cycles}
|
||||||
{LoadSec: 600, IdleSec: 60},
|
|
||||||
{LoadSec: 600, IdleSec: 30},
|
|
||||||
{LoadSec: 600, IdleSec: 120},
|
|
||||||
{LoadSec: 600, IdleSec: 60},
|
|
||||||
}}
|
|
||||||
case "acceptance":
|
case "acceptance":
|
||||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
return platform.PlatformStressOptions{Cycles: acceptanceCycles}
|
||||||
{LoadSec: 300, IdleSec: 60},
|
|
||||||
{LoadSec: 300, IdleSec: 30},
|
|
||||||
{LoadSec: 300, IdleSec: 60},
|
|
||||||
{LoadSec: 300, IdleSec: 30},
|
|
||||||
}}
|
|
||||||
default: // smoke
|
default: // smoke
|
||||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
{LoadSec: 90, IdleSec: 60},
|
{LoadSec: 85, IdleSec: 5},
|
||||||
{LoadSec: 90, IdleSec: 30},
|
{LoadSec: 80, IdleSec: 10},
|
||||||
|
{LoadSec: 55, IdleSec: 5},
|
||||||
|
{LoadSec: 60, IdleSec: 0},
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||||
type taskQueue struct {
|
type taskQueue struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
tasks []*Task
|
tasks []*Task
|
||||||
trigger chan struct{}
|
trigger chan struct{}
|
||||||
opts *HandlerOptions // set by startWorker
|
opts *HandlerOptions // set by startWorker
|
||||||
statePath string
|
statePath string
|
||||||
logsDir string
|
logsDir string
|
||||||
started bool
|
started bool
|
||||||
|
kmsgWatcher *kmsgWatcher
|
||||||
}
|
}
|
||||||
|
|
||||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||||
@@ -231,6 +259,7 @@ func (q *taskQueue) enqueue(t *Task) {
|
|||||||
q.prune()
|
q.prune()
|
||||||
q.persistLocked()
|
q.persistLocked()
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
|
taskSerialEvent(t, "queued")
|
||||||
select {
|
select {
|
||||||
case q.trigger <- struct{}{}:
|
case q.trigger <- struct{}{}:
|
||||||
default:
|
default:
|
||||||
@@ -376,7 +405,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
if !q.started {
|
if !q.started {
|
||||||
q.loadLocked()
|
q.loadLocked()
|
||||||
q.started = true
|
q.started = true
|
||||||
go q.worker()
|
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||||
}
|
}
|
||||||
hasPending := q.nextPending() != nil
|
hasPending := q.nextPending() != nil
|
||||||
q.mu.Unlock()
|
q.mu.Unlock()
|
||||||
@@ -391,47 +420,101 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
|||||||
func (q *taskQueue) worker() {
|
func (q *taskQueue) worker() {
|
||||||
for {
|
for {
|
||||||
<-q.trigger
|
<-q.trigger
|
||||||
setCPUGovernor("performance")
|
func() {
|
||||||
for {
|
setCPUGovernor("performance")
|
||||||
q.mu.Lock()
|
defer setCPUGovernor("powersave")
|
||||||
t := q.nextPending()
|
|
||||||
if t == nil {
|
|
||||||
q.mu.Unlock()
|
|
||||||
break
|
|
||||||
}
|
|
||||||
now := time.Now()
|
|
||||||
t.Status = TaskRunning
|
|
||||||
t.StartedAt = &now
|
|
||||||
t.DoneAt = nil
|
|
||||||
t.ErrMsg = ""
|
|
||||||
j := newTaskJobState(t.LogPath)
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
j.cancel = cancel
|
|
||||||
t.job = j
|
|
||||||
q.persistLocked()
|
|
||||||
q.mu.Unlock()
|
|
||||||
|
|
||||||
q.runTask(t, j, ctx)
|
for {
|
||||||
|
q.mu.Lock()
|
||||||
q.mu.Lock()
|
t := q.nextPending()
|
||||||
now2 := time.Now()
|
if t == nil {
|
||||||
t.DoneAt = &now2
|
q.prune()
|
||||||
if t.Status == TaskRunning { // not cancelled externally
|
q.persistLocked()
|
||||||
if j.err != "" {
|
q.mu.Unlock()
|
||||||
t.Status = TaskFailed
|
return
|
||||||
t.ErrMsg = j.err
|
|
||||||
} else {
|
|
||||||
t.Status = TaskDone
|
|
||||||
}
|
}
|
||||||
|
now := time.Now()
|
||||||
|
t.Status = TaskRunning
|
||||||
|
t.StartedAt = &now
|
||||||
|
t.DoneAt = nil
|
||||||
|
t.ErrMsg = ""
|
||||||
|
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||||
|
t.job = j
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||||
|
j.cancel = taskCancel
|
||||||
|
q.executeTask(t, j, taskCtx)
|
||||||
|
taskCancel()
|
||||||
|
|
||||||
|
q.mu.Lock()
|
||||||
|
q.prune()
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
}
|
}
|
||||||
q.prune()
|
}()
|
||||||
q.persistLocked()
|
|
||||||
q.mu.Unlock()
|
|
||||||
}
|
|
||||||
setCPUGovernor("powersave")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||||
|
startedKmsgWatch := false
|
||||||
|
defer q.finalizeTaskRun(t, j)
|
||||||
|
defer func() {
|
||||||
|
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
defer func() {
|
||||||
|
if rec := recover(); rec != nil {
|
||||||
|
msg := fmt.Sprintf("task panic: %v", rec)
|
||||||
|
slog.Error("task panic",
|
||||||
|
"task_id", t.ID,
|
||||||
|
"target", t.Target,
|
||||||
|
"panic", fmt.Sprint(rec),
|
||||||
|
"stack", string(debug.Stack()),
|
||||||
|
)
|
||||||
|
j.append("ERROR: " + msg)
|
||||||
|
j.finish(msg)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
|
startedKmsgWatch = true
|
||||||
|
}
|
||||||
|
|
||||||
|
q.runTask(t, j, ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
||||||
|
q.mu.Lock()
|
||||||
|
now := time.Now()
|
||||||
|
t.DoneAt = &now
|
||||||
|
if t.Status == TaskRunning {
|
||||||
|
if j.err != "" {
|
||||||
|
t.Status = TaskFailed
|
||||||
|
t.ErrMsg = j.err
|
||||||
|
} else {
|
||||||
|
t.Status = TaskDone
|
||||||
|
t.ErrMsg = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
q.finalizeTaskArtifactPathsLocked(t)
|
||||||
|
q.persistLocked()
|
||||||
|
q.mu.Unlock()
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(t); err != nil {
|
||||||
|
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||||
|
}
|
||||||
|
if t.ErrMsg != "" {
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
|
}
|
||||||
|
|
||||||
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||||
// Silently ignores errors (e.g. when cpufreq is not available).
|
// Silently ignores errors (e.g. when cpufreq is not available).
|
||||||
func setCPUGovernor(governor string) {
|
func setCPUGovernor(governor string) {
|
||||||
@@ -470,9 +553,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
diagLevel := t.params.DiagLevel
|
diagLevel := t.params.DiagLevel
|
||||||
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
|
||||||
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
|
||||||
}
|
|
||||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||||
@@ -485,6 +565,79 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
} else {
|
} else {
|
||||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
}
|
}
|
||||||
|
case "nvidia-targeted-stress":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if dur <= 0 {
|
||||||
|
dur = 300
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-benchmark":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||||
|
Profile: t.params.BenchmarkProfile,
|
||||||
|
SizeMB: t.params.SizeMB,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
|
}, j.append)
|
||||||
|
case "nvidia-compute":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-targeted-power":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-pulse":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-bandwidth":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
|
||||||
|
case "nvidia-interconnect":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
|
DurationSec: dur,
|
||||||
|
Loader: platform.NvidiaStressLoaderNCCL,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
}, j.append)
|
||||||
case "nvidia-stress":
|
case "nvidia-stress":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -618,6 +771,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||||
|
if archive != "" {
|
||||||
|
archivePath := app.ExtractArchivePath(archive)
|
||||||
|
if err == nil {
|
||||||
|
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||||
|
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if db := q.statusDB(); db != nil {
|
||||||
|
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
j.append("Aborted.")
|
j.append("Aborted.")
|
||||||
@@ -634,6 +800,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
j.finish("")
|
j.finish("")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||||
|
if q.opts == nil || q.opts.App == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return q.opts.App.StatusDB
|
||||||
|
}
|
||||||
|
|
||||||
func splitLines(s string) []string {
|
func splitLines(s string) []string {
|
||||||
var out []string
|
var out []string
|
||||||
for _, l := range splitNL(s) {
|
for _, l := range splitNL(s) {
|
||||||
@@ -679,6 +852,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
globalQueue.persistLocked()
|
globalQueue.persistLocked()
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -688,6 +862,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
|||||||
now := time.Now()
|
now := time.Now()
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
globalQueue.persistLocked()
|
globalQueue.persistLocked()
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||||
default:
|
default:
|
||||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||||
@@ -728,6 +903,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
case TaskPending:
|
case TaskPending:
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
n++
|
n++
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -735,6 +911,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
|||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -753,6 +930,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
case TaskPending:
|
case TaskPending:
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
cancelled++
|
cancelled++
|
||||||
case TaskRunning:
|
case TaskRunning:
|
||||||
if t.job != nil {
|
if t.job != nil {
|
||||||
@@ -760,6 +938,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
|||||||
}
|
}
|
||||||
t.Status = TaskCancelled
|
t.Status = TaskCancelled
|
||||||
t.DoneAt = &now
|
t.DoneAt = &now
|
||||||
|
taskSerialEvent(t, "finished with status="+t.Status)
|
||||||
cancelled++
|
cancelled++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -823,10 +1002,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||||
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
if q.logsDir == "" || t.ID == "" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
q.ensureTaskArtifactPathsLocked(t)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *taskQueue) loadLocked() {
|
func (q *taskQueue) loadLocked() {
|
||||||
@@ -843,17 +1022,20 @@ func (q *taskQueue) loadLocked() {
|
|||||||
}
|
}
|
||||||
for _, pt := range persisted {
|
for _, pt := range persisted {
|
||||||
t := &Task{
|
t := &Task{
|
||||||
ID: pt.ID,
|
ID: pt.ID,
|
||||||
Name: pt.Name,
|
Name: pt.Name,
|
||||||
Target: pt.Target,
|
Target: pt.Target,
|
||||||
Priority: pt.Priority,
|
Priority: pt.Priority,
|
||||||
Status: pt.Status,
|
Status: pt.Status,
|
||||||
CreatedAt: pt.CreatedAt,
|
CreatedAt: pt.CreatedAt,
|
||||||
StartedAt: pt.StartedAt,
|
StartedAt: pt.StartedAt,
|
||||||
DoneAt: pt.DoneAt,
|
DoneAt: pt.DoneAt,
|
||||||
ErrMsg: pt.ErrMsg,
|
ErrMsg: pt.ErrMsg,
|
||||||
LogPath: pt.LogPath,
|
LogPath: pt.LogPath,
|
||||||
params: pt.Params,
|
ArtifactsDir: pt.ArtifactsDir,
|
||||||
|
ReportJSONPath: pt.ReportJSONPath,
|
||||||
|
ReportHTMLPath: pt.ReportHTMLPath,
|
||||||
|
params: pt.Params,
|
||||||
}
|
}
|
||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskRunning {
|
if t.Status == TaskRunning {
|
||||||
@@ -884,17 +1066,20 @@ func (q *taskQueue) persistLocked() {
|
|||||||
state := make([]persistedTask, 0, len(q.tasks))
|
state := make([]persistedTask, 0, len(q.tasks))
|
||||||
for _, t := range q.tasks {
|
for _, t := range q.tasks {
|
||||||
state = append(state, persistedTask{
|
state = append(state, persistedTask{
|
||||||
ID: t.ID,
|
ID: t.ID,
|
||||||
Name: t.Name,
|
Name: t.Name,
|
||||||
Target: t.Target,
|
Target: t.Target,
|
||||||
Priority: t.Priority,
|
Priority: t.Priority,
|
||||||
Status: t.Status,
|
Status: t.Status,
|
||||||
CreatedAt: t.CreatedAt,
|
CreatedAt: t.CreatedAt,
|
||||||
StartedAt: t.StartedAt,
|
StartedAt: t.StartedAt,
|
||||||
DoneAt: t.DoneAt,
|
DoneAt: t.DoneAt,
|
||||||
ErrMsg: t.ErrMsg,
|
ErrMsg: t.ErrMsg,
|
||||||
LogPath: t.LogPath,
|
LogPath: t.LogPath,
|
||||||
Params: t.params,
|
ArtifactsDir: t.ArtifactsDir,
|
||||||
|
ReportJSONPath: t.ReportJSONPath,
|
||||||
|
ReportHTMLPath: t.ReportHTMLPath,
|
||||||
|
Params: t.params,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
data, err := json.MarshalIndent(state, "", " ")
|
data, err := json.MarshalIndent(state, "", " ")
|
||||||
@@ -925,3 +1110,113 @@ func taskElapsedSec(t *Task, now time.Time) int {
|
|||||||
}
|
}
|
||||||
return int(end.Sub(start).Round(time.Second) / time.Second)
|
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func taskFolderStatus(status string) string {
|
||||||
|
status = strings.TrimSpace(strings.ToLower(status))
|
||||||
|
switch status {
|
||||||
|
case TaskRunning, TaskDone, TaskFailed, TaskCancelled:
|
||||||
|
return status
|
||||||
|
default:
|
||||||
|
return TaskPending
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeTaskFolderPart turns s into a folder-name fragment made only of
// the characters a-z, 0-9 and '-'.
//
// The input is lower-cased and trimmed. Every run of other characters becomes
// a single '-', and leading/trailing dashes are dropped. When nothing usable
// remains the result is "task".
func sanitizeTaskFolderPart(s string) string {
	const fallback = "task"

	normalized := strings.TrimSpace(strings.ToLower(s))

	// Splitting on non-alphanumeric runes collapses each separator run and
	// discards leading/trailing separators in one step.
	words := strings.FieldsFunc(normalized, func(r rune) bool {
		isLetter := 'a' <= r && r <= 'z'
		isDigit := '0' <= r && r <= '9'
		return !isLetter && !isDigit
	})
	if len(words) == 0 {
		return fallback
	}
	return strings.Join(words, "-")
}
|
||||||
|
|
||||||
|
func taskArtifactsDir(root string, t *Task, status string) string {
|
||||||
|
if strings.TrimSpace(root) == "" || t == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
prefix := taskFolderNumberPrefix(t.ID)
|
||||||
|
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskFolderNumberPrefix(taskID string) string {
|
||||||
|
taskID = strings.TrimSpace(taskID)
|
||||||
|
if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
|
||||||
|
num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
|
||||||
|
if len(num) == 3 {
|
||||||
|
allDigits := true
|
||||||
|
for _, r := range num {
|
||||||
|
if r < '0' || r > '9' {
|
||||||
|
allDigits = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allDigits {
|
||||||
|
return num
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fallback := sanitizeTaskFolderPart(taskID)
|
||||||
|
if fallback == "" {
|
||||||
|
return "000"
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureTaskReportPaths(t *Task) {
|
||||||
|
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if t.LogPath == "" || filepath.Base(t.LogPath) == "task.log" {
|
||||||
|
t.LogPath = filepath.Join(t.ArtifactsDir, "task.log")
|
||||||
|
}
|
||||||
|
t.ReportJSONPath = filepath.Join(t.ArtifactsDir, "report.json")
|
||||||
|
t.ReportHTMLPath = filepath.Join(t.ArtifactsDir, "report.html")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
|
||||||
|
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||||
|
t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
|
||||||
|
}
|
||||||
|
if t.ArtifactsDir != "" {
|
||||||
|
_ = os.MkdirAll(t.ArtifactsDir, 0755)
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
|
||||||
|
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
q.ensureTaskArtifactPathsLocked(t)
|
||||||
|
dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
|
||||||
|
if dstDir == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
|
||||||
|
if _, err := os.Stat(dstDir); err != nil {
|
||||||
|
_ = os.Rename(t.ArtifactsDir, dstDir)
|
||||||
|
}
|
||||||
|
t.ArtifactsDir = dstDir
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(t)
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package webui
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -12,6 +13,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||||
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
origCounter := jobCounter.Load()
|
||||||
|
jobCounter.Store(0)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
jobCounter.Store(origCounter)
|
||||||
|
})
|
||||||
|
|
||||||
|
if got := newJobID("ignored"); got != "TASK-000" {
|
||||||
|
t.Fatalf("id=%q want TASK-000", got)
|
||||||
|
}
|
||||||
|
if got := newJobID("ignored"); got != "TASK-001" {
|
||||||
|
t.Fatalf("id=%q want TASK-001", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
task := &Task{
|
||||||
|
ID: "TASK-007",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
}
|
||||||
|
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||||
|
if !strings.HasPrefix(got, "007_") {
|
||||||
|
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -248,15 +284,205 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
|||||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
db, err := openMetricsDB(metricsPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
base := time.Now().UTC().Add(-45 * time.Second)
|
||||||
|
if err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base,
|
||||||
|
CPULoadPct: 42,
|
||||||
|
MemLoadPct: 35,
|
||||||
|
PowerW: 510,
|
||||||
|
}); err != nil {
|
||||||
|
t.Fatalf("Write: %v", err)
|
||||||
|
}
|
||||||
|
_ = db.Close()
|
||||||
|
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now().UTC().Add(-90 * time.Second)
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: started.Add(-10 * time.Second),
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(task)
|
||||||
|
appendJobLog(task.LogPath, "line-1")
|
||||||
|
|
||||||
|
job := newTaskJobState(task.LogPath)
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
if task.Status != TaskDone {
|
||||||
|
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||||
|
}
|
||||||
|
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||||
|
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||||
|
t.Fatalf("report json: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||||
|
t.Fatalf("report html: %v", err)
|
||||||
|
}
|
||||||
|
var report taskReport
|
||||||
|
data, err := os.ReadFile(task.ReportJSONPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(data, &report); err != nil {
|
||||||
|
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||||
|
}
|
||||||
|
if report.ID != task.ID || report.Status != TaskDone {
|
||||||
|
t.Fatalf("report=%+v", report)
|
||||||
|
}
|
||||||
|
if len(report.Charts) == 0 {
|
||||||
|
t.Fatalf("expected charts in report, got none")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
Target: "nvidia-benchmark",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 0`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
|
var lines []string
|
||||||
|
prev := taskSerialWriteLine
|
||||||
|
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||||
|
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-serial-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
q.enqueue(task)
|
||||||
|
started := time.Now().UTC()
|
||||||
|
task.Status = TaskRunning
|
||||||
|
task.StartedAt = &started
|
||||||
|
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||||
|
job.append("Starting CPU SAT...")
|
||||||
|
job.append("CPU stress duration: 60s")
|
||||||
|
job.finish("")
|
||||||
|
q.finalizeTaskRun(task, job)
|
||||||
|
|
||||||
|
joined := strings.Join(lines, "\n")
|
||||||
|
for _, needle := range []string{
|
||||||
|
"queued",
|
||||||
|
"Starting CPU SAT...",
|
||||||
|
"CPU stress duration: 60s",
|
||||||
|
"finished with status=done",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(joined, needle) {
|
||||||
|
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestResolveBurnPreset(t *testing.T) {
|
func TestResolveBurnPreset(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
profile string
|
profile string
|
||||||
want burnPreset
|
want burnPreset
|
||||||
}{
|
}{
|
||||||
{profile: "smoke", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "smoke", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
{profile: "acceptance", want: burnPreset{NvidiaDiag: 3, DurationSec: 60 * 60}},
|
{profile: "acceptance", want: burnPreset{DurationSec: 60 * 60}},
|
||||||
{profile: "overnight", want: burnPreset{NvidiaDiag: 4, DurationSec: 8 * 60 * 60}},
|
{profile: "overnight", want: burnPreset{DurationSec: 8 * 60 * 60}},
|
||||||
{profile: "", want: burnPreset{NvidiaDiag: 1, DurationSec: 5 * 60}},
|
{profile: "", want: burnPreset{DurationSec: 5 * 60}},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
if got := resolveBurnPreset(tc.profile); got != tc.want {
|
||||||
@@ -467,3 +693,52 @@ func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
|||||||
t.Fatalf("unexpected error: %q", j.err)
|
t.Fatalf("unexpected error: %q", j.err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{App: &app.App{}},
|
||||||
|
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-panic-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
j := &jobState{}
|
||||||
|
|
||||||
|
orig := runCPUAcceptancePackCtx
|
||||||
|
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
|
panic("boom")
|
||||||
|
}
|
||||||
|
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||||
|
|
||||||
|
q.executeTask(tk, j, context.Background())
|
||||||
|
|
||||||
|
if tk.Status != TaskFailed {
|
||||||
|
t.Fatalf("status=%q want %q", tk.Status, TaskFailed)
|
||||||
|
}
|
||||||
|
if tk.DoneAt == nil {
|
||||||
|
t.Fatal("expected done_at to be set")
|
||||||
|
}
|
||||||
|
if !strings.Contains(tk.ErrMsg, "task panic: boom") {
|
||||||
|
t.Fatalf("task error=%q", tk.ErrMsg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(j.err, "task panic: boom") {
|
||||||
|
t.Fatalf("job error=%q", j.err)
|
||||||
|
}
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
16
audit/scripts/resolve-version.sh
Executable file
16
audit/scripts/resolve-version.sh
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/sh
# Print the build version derived from the nearest v-prefixed git tag.
#
#   - tag found     -> `git describe` output with the leading "v" removed
#   - no tag / no git repository -> "dev"
set -eu

# A failing `git describe` (no tags, not a repository) must not abort the
# script under `set -e`; it simply yields an empty string.
described="$(git describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"

if [ -z "${described}" ]; then
	printf 'dev\n'
else
	# ${described#v} strips one leading "v" and leaves any other value untouched.
	printf '%s\n' "${described#v}"
fi
|
||||||
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Status
|
||||||
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||||
|
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||||
|
`avg_steady_clock < locked_target * 0.90`.
|
||||||
|
|
||||||
|
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||||
|
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||||
|
The 90% threshold has no empirical basis.
|
||||||
|
|
||||||
|
## Key observations (2026-04-06)
|
||||||
|
|
||||||
|
### H100 PCIe — new card, server not designed for it
|
||||||
|
- avg clock 1384 MHz, P95 1560 MHz (unstable; probable boost 1755 MHz)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||||
|
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||||
|
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||||
|
|
||||||
|
### H200 NVL — new card, server not designed for it
|
||||||
|
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||||
|
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||||
|
- Degradation: power_capped, thermal_limited
|
||||||
|
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||||
|
|
||||||
|
### Key insight
|
||||||
|
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||||
|
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||||
|
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||||
|
instability may reflect a more severe thermal mismatch or a card issue.
|
||||||
|
|
||||||
|
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||||
|
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||||
|
|
||||||
|
## Hypothesis for baseline
|
||||||
|
|
||||||
|
After testing on servers designed for their GPUs (proper cooling):
|
||||||
|
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||||
|
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||||
|
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||||
|
a healthy card under real workload should comfortably exceed base clock
|
||||||
|
|
||||||
|
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||||
|
|
||||||
|
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||||
|
|
||||||
|
### Healthy fingerprint
|
||||||
|
|
||||||
|
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||||
|
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||||
|
- Avg steady (visual): **~1580–1620 MHz**
|
||||||
|
- vs boost 1755 MHz: **~91–92%**
|
||||||
|
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||||
|
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||||
|
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||||
|
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||||
|
|
||||||
|
### Characteristic pattern
|
||||||
|
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||||
|
power cap algorithm. Do NOT flag this as instability.
|
||||||
|
|
||||||
|
### Clock CV implication
|
||||||
|
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||||
|
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||||
|
healthy HBM2e PCIe cards. Needs recalibration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||||
|
|
||||||
|
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||||
|
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||||
|
|
||||||
|
### GPU clock reference (from nvidia-smi, idle):
|
||||||
|
- base_clock_mhz: **1095**
|
||||||
|
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||||
|
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||||
|
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||||
|
|
||||||
|
### Observed under 700W sustained load (both samples nearly identical):
|
||||||
|
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||||
|
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||||
|
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||||
|
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||||
|
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||||
|
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||||
|
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||||
|
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||||
|
|
||||||
|
### Diagnosis
|
||||||
|
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||||
|
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||||
|
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||||
|
refurbishment process.
|
||||||
|
|
||||||
|
### Clock pattern note
|
||||||
|
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||||
|
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline matrix (filled where data available)
|
||||||
|
|
||||||
|
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||||
|
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||||
|
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||||
|
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## H100 official spec (from NVIDIA datasheet)
|
||||||
|
|
||||||
|
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||||
|
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||||
|
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||||
|
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||||
|
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||||
|
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||||
|
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||||
|
|
||||||
|
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||||
|
|
||||||
|
From the report in this session (power+thermal throttle throughout steady):
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||||
|
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||||
|
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||||
|
|
||||||
|
33–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||||
|
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||||
|
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||||
|
|
||||||
|
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||||
|
|
||||||
|
Format: without sparsity / with sparsity.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||||
|
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||||
|
|
||||||
|
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||||
|
|
||||||
|
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||||
|
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||||
|
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||||
|
|
||||||
|
Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
|
||||||
|
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||||
|
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||||
|
|
||||||
|
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||||
|
|
||||||
|
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||||
|
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||||
|
|
||||||
|
### What healthy systems actually achieve:
|
||||||
|
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||||
|
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||||
|
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||||
|
|
||||||
|
### Our results vs expectation:
|
||||||
|
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||||
|
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||||
|
|
||||||
|
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||||
|
This is NOT normal — 30-44% is not the industry baseline.
|
||||||
|
|
||||||
|
### Likely causes of the gap (in order of probability):
|
||||||
|
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||||
|
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||||
|
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||||
|
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||||
|
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||||
|
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||||
|
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||||
|
|
||||||
|
### Power limit gap analysis (H100 PCIe):
|
||||||
|
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||||
|
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||||
|
- Actually measured: 329 TOPS = 55% of that estimate
|
||||||
|
- Remaining gap after accounting for clock throttle: ~45%
|
||||||
|
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||||
|
sustainable clock beyond what sw_thermal alone would cause.
|
||||||
|
|
||||||
|
### Action item:
|
||||||
|
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||||
|
so result.json shows if the card was pre-configured with a non-default limit.
|
||||||
|
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||||
|
|
||||||
|
### CPU/RAM impact on GPU FLOPS:
|
||||||
|
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||||
|
CPU core count and host RAM are irrelevant.
|
||||||
|
|
||||||
|
## Compute efficiency metric (proposed, no hardcode)
|
||||||
|
|
||||||
|
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||||
|
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||||
|
|
||||||
|
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||||
|
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||||
|
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||||
|
normal clocks.
|
||||||
|
|
||||||
|
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||||
|
(needs to be added to queryBenchmarkGPUInfo).
|
||||||
|
|
||||||
|
Reference values to establish after baseline runs:
|
||||||
|
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||||
|
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||||
|
|
||||||
|
## Proposed threshold changes (pending more data)
|
||||||
|
|
||||||
|
1. **`low_sm_clock_vs_target`**: lower threshold from 90% to 85% based on observed
|
||||||
|
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||||
|
capture the root cause.
|
||||||
|
|
||||||
|
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||||
|
under power cap. Consider suppressing this flag when power is flat and usage
|
||||||
|
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||||
|
|
||||||
|
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||||
|
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||||
|
would have been caught by this).
|
||||||
|
|
||||||
|
Decision deferred until baseline on SXM designed servers collected.
|
||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -606,6 +606,20 @@ struct prepared_profile {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct profile_desc k_profiles[] = {
|
static const struct profile_desc k_profiles[] = {
|
||||||
|
{
|
||||||
|
"fp64",
|
||||||
|
"fp64",
|
||||||
|
80,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
8,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUBLAS_COMPUTE_64F,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp32_tf32",
|
"fp32_tf32",
|
||||||
"fp32",
|
"fp32",
|
||||||
|
|||||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$VARIANT" in
|
case "$VARIANT" in
|
||||||
nvidia|amd|nogpu|all) ;;
|
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -180,6 +185,9 @@ case "$VARIANT" in
|
|||||||
nvidia)
|
nvidia)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
;;
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
run_variant nvidia-legacy
|
||||||
|
;;
|
||||||
amd)
|
amd)
|
||||||
run_variant amd
|
run_variant amd
|
||||||
;;
|
;;
|
||||||
@@ -188,6 +196,7 @@ case "$VARIANT" in
|
|||||||
;;
|
;;
|
||||||
all)
|
all)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
|
run_variant nvidia-legacy
|
||||||
run_variant amd
|
run_variant amd
|
||||||
run_variant nogpu
|
run_variant nogpu
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||||
|
# - open -> kernel-open/ sources from the .run installer
|
||||||
|
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||||
#
|
#
|
||||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||||
@@ -17,10 +19,19 @@ set -e
|
|||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
DEBIAN_KERNEL_ABI="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
NVIDIA_FLAVOR="${4:-open}"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
|
|
||||||
|
case "$NVIDIA_FLAVOR" in
|
||||||
|
open|proprietary) ;;
|
||||||
|
*)
|
||||||
|
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# On Debian, kernel headers are split into two packages:
|
# On Debian, kernel headers are split into two packages:
|
||||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
|||||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||||
|
|
||||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||||
echo "=== installing linux-headers-${KVER} ==="
|
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
|
||||||
"linux-headers-${KVER}" \
|
|
||||||
gcc make perl
|
|
||||||
fi
|
|
||||||
echo "kernel headers (arch): $KDIR_ARCH"
|
|
||||||
echo "kernel headers (common): $KDIR_COMMON"
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
CACHE_LAYOUT_VERSION="2"
|
CACHE_LAYOUT_VERSION="3"
|
||||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
# Download official NVIDIA .run installer with sha256 verification
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
|||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
# Find kernel source directory for the selected flavor.
|
||||||
KERNEL_SRC=""
|
KERNEL_SRC=""
|
||||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||||
done
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
done
|
||||||
|
else
|
||||||
|
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||||
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules
|
# Build kernel modules
|
||||||
|
|||||||
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
|
|||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BUILD_VARIANT="nvidia"
|
||||||
BEE_GPU_VENDOR="nvidia"
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$BEE_GPU_VENDOR" in
|
case "$BUILD_VARIANT" in
|
||||||
nvidia|amd|nogpu) ;;
|
nvidia)
|
||||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
BEE_GPU_VENDOR="amd"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
BEE_GPU_VENDOR="nogpu"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
@@ -54,15 +74,8 @@ resolve_audit_version() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'audit/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
if [ -z "${tag}" ]; then
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
|
||||||
fi
|
|
||||||
case "${tag}" in
|
case "${tag}" in
|
||||||
audit/v*)
|
|
||||||
echo "${tag#audit/v}"
|
|
||||||
return 0
|
|
||||||
;;
|
|
||||||
v*)
|
v*)
|
||||||
echo "${tag#v}"
|
echo "${tag#v}"
|
||||||
return 0
|
return 0
|
||||||
@@ -309,6 +322,12 @@ memtest_fail() {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
nvidia_runtime_fail() {
|
||||||
|
msg="$1"
|
||||||
|
echo "ERROR: ${msg}" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
iso_memtest_present() {
|
iso_memtest_present() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
iso_files="$(mktemp)"
|
iso_files="$(mktemp)"
|
||||||
@@ -446,6 +465,44 @@ validate_iso_memtest() {
|
|||||||
echo "=== memtest validation OK ==="
|
echo "=== memtest validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_nvidia_runtime() {
|
||||||
|
iso_path="$1"
|
||||||
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
|
|
||||||
|
echo "=== validating NVIDIA runtime in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || nvidia_runtime_fail "ISO not found for NVIDIA runtime validation: $iso_path"
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || nvidia_runtime_fail "ISO reader unavailable for NVIDIA runtime validation"
|
||||||
|
command -v unsquashfs >/dev/null 2>&1 || nvidia_runtime_fail "unsquashfs is required for NVIDIA runtime validation"
|
||||||
|
|
||||||
|
squashfs_tmp="$(mktemp)"
|
||||||
|
squashfs_list="$(mktemp)"
|
||||||
|
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list"
|
||||||
|
echo "=== NVIDIA runtime validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
append_memtest_grub_entry() {
|
append_memtest_grub_entry() {
|
||||||
grub_cfg="$1"
|
grub_cfg="$1"
|
||||||
[ -f "$grub_cfg" ] || return 1
|
[ -f "$grub_cfg" ] || return 1
|
||||||
@@ -590,7 +647,7 @@ recover_iso_memtest() {
|
|||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
@@ -764,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -834,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
@@ -860,6 +917,86 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "$BEE_GPU_VENDOR" != "nvidia" ] || [ "$BEE_NVIDIA_MODULE_FLAVOR" != "proprietary" ]; then
|
||||||
|
cat > "${BUILD_WORK_DIR}/config/bootloaders/grub-pc/grub.cfg" <<'EOF'
|
||||||
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
||||||
|
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
||||||
|
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
||||||
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
menuentry "EASY-BEE" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
chainloader /boot/memtest86+x64.efi
|
||||||
|
}
|
||||||
|
else
|
||||||
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
linux16 /boot/memtest86+x64.bin
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
menuentry "UEFI Firmware Settings" {
|
||||||
|
fwsetup
|
||||||
|
}
|
||||||
|
fi
|
||||||
|
EOF
|
||||||
|
|
||||||
|
cat > "${BUILD_WORK_DIR}/config/bootloaders/isolinux/live.cfg.in" <<'EOF'
|
||||||
|
label live-@FLAVOUR@-normal
|
||||||
|
menu label ^EASY-BEE
|
||||||
|
menu default
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-kms
|
||||||
|
menu label EASY-BEE (^graphics/KMS)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ bee.display=kms
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-toram
|
||||||
|
menu label EASY-BEE (^load to RAM)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ toram
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-failsafe
|
||||||
|
menu label EASY-BEE (^fail-safe)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||||
|
|
||||||
|
label memtest
|
||||||
|
menu label ^Memory Test (memtest86+)
|
||||||
|
linux /boot/memtest86+x64.bin
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
@@ -944,10 +1081,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1018,13 +1155,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
|||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
@@ -1036,6 +1174,7 @@ fi
|
|||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
@@ -1046,6 +1185,11 @@ EOF
|
|||||||
|
|
||||||
# Write GPU vendor marker for hooks
|
# Write GPU vendor marker for hooks
|
||||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
fi
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
@@ -1116,10 +1260,10 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
@@ -1151,9 +1295,10 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ echo " █████╗ ███████║███████╗ ╚
|
|||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||||
|
echo " Hardware Audit LiveCD"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
@@ -14,29 +15,21 @@ menuentry "EASY-BEE" {
|
|||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS)" {
|
submenu "EASY-BEE (advanced options) -->" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
menuentry "EASY-BEE — GSP=off" {
|
||||||
initrd @INITRD_LIVE@
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
}
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
menuentry "EASY-BEE — fail-safe" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
|
||||||
|
|
||||||
menuentry "EASY-BEE (fail-safe)" {
|
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
|
||||||
initrd @INITRD_LIVE@
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
|
|||||||
@@ -30,6 +30,8 @@ systemctl enable bee-preflight.service
|
|||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
systemctl enable bee-sshsetup.service
|
systemctl enable bee-sshsetup.service
|
||||||
|
systemctl enable bee-selfheal.timer
|
||||||
|
systemctl enable bee-boot-status.service
|
||||||
systemctl enable ssh.service
|
systemctl enable ssh.service
|
||||||
systemctl enable lightdm.service 2>/dev/null || true
|
systemctl enable lightdm.service 2>/dev/null || true
|
||||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||||
@@ -58,6 +60,8 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
117
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
@@ -0,0 +1,117 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||||
|
set -e
|
||||||
|
echo "=== generating bee wallpaper ==="
|
||||||
|
mkdir -p /usr/share/bee
|
||||||
|
|
||||||
|
python3 - <<'PYEOF'
|
||||||
|
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||||
|
import os
|
||||||
|
|
||||||
|
W, H = 1920, 1080
|
||||||
|
|
||||||
|
GLYPHS = {
|
||||||
|
'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
|
||||||
|
'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
|
||||||
|
'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
|
||||||
|
'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
|
||||||
|
'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
|
||||||
|
'-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
|
||||||
|
}
|
||||||
|
|
||||||
|
TITLE = "EASY-BEE"
|
||||||
|
SUBTITLE = "Hardware Audit LiveCD"
|
||||||
|
CELL = 30
|
||||||
|
GLYPH_GAP = 18
|
||||||
|
ROW_GAP = 6
|
||||||
|
|
||||||
|
FG = (0xF6, 0xD0, 0x47)
|
||||||
|
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||||
|
SHADOW = (0x5E, 0x47, 0x05)
|
||||||
|
SUB = (0x96, 0x7A, 0x17)
|
||||||
|
BG = (0x05, 0x05, 0x05)
|
||||||
|
|
||||||
|
SUB_FONT_CANDIDATES = [
|
||||||
|
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf',
|
||||||
|
'/usr/share/fonts/truetype/freefont/FreeSansBold.ttf',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_font(size):
|
||||||
|
for path in SUB_FONT_CANDIDATES:
|
||||||
|
if os.path.exists(path):
|
||||||
|
return ImageFont.truetype(path, size)
|
||||||
|
return ImageFont.load_default()
|
||||||
|
|
||||||
|
|
||||||
|
def glyph_width(ch):
|
||||||
|
return len(GLYPHS[ch][0])
|
||||||
|
|
||||||
|
|
||||||
|
def render_logo_mask():
|
||||||
|
width_cells = 0
|
||||||
|
for idx, ch in enumerate(TITLE):
|
||||||
|
width_cells += glyph_width(ch)
|
||||||
|
if idx != len(TITLE) - 1:
|
||||||
|
width_cells += 1
|
||||||
|
mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
|
||||||
|
mask_h = 7 * CELL + 6 * ROW_GAP
|
||||||
|
mask = Image.new('L', (mask_w, mask_h), 0)
|
||||||
|
draw = ImageDraw.Draw(mask)
|
||||||
|
|
||||||
|
cx = 0
|
||||||
|
for idx, ch in enumerate(TITLE):
|
||||||
|
glyph = GLYPHS[ch]
|
||||||
|
for row_idx, row in enumerate(glyph):
|
||||||
|
for col_idx, cell in enumerate(row):
|
||||||
|
if cell != '1':
|
||||||
|
continue
|
||||||
|
x0 = cx + col_idx * CELL
|
||||||
|
y0 = row_idx * (CELL + ROW_GAP)
|
||||||
|
x1 = x0 + CELL - 4
|
||||||
|
y1 = y0 + CELL - 4
|
||||||
|
draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
|
||||||
|
cx += glyph_width(ch) * CELL
|
||||||
|
if idx != len(TITLE) - 1:
|
||||||
|
cx += CELL + GLYPH_GAP
|
||||||
|
return mask
|
||||||
|
|
||||||
|
|
||||||
|
img = Image.new('RGB', (W, H), BG)
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
|
||||||
|
# Soft amber glow under the logo without depending on font rendering.
|
||||||
|
glow = Image.new('RGBA', (W, H), (0, 0, 0, 0))
|
||||||
|
glow_draw = ImageDraw.Draw(glow)
|
||||||
|
glow_draw.ellipse((360, 250, 1560, 840), fill=(180, 120, 10, 56))
|
||||||
|
glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||||
|
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||||
|
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||||
|
|
||||||
|
logo_mask = render_logo_mask()
|
||||||
|
logo_w, logo_h = logo_mask.size
|
||||||
|
logo_x = (W - logo_w) // 2
|
||||||
|
logo_y = 290
|
||||||
|
|
||||||
|
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
||||||
|
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
||||||
|
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
||||||
|
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||||
|
|
||||||
|
font_sub = load_font(30)
|
||||||
|
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||||
|
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||||
|
sub_y = logo_y + logo_h + 54
|
||||||
|
draw = ImageDraw.Draw(img)
|
||||||
|
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||||
|
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||||
|
|
||||||
|
img = img.convert('RGB')
|
||||||
|
|
||||||
|
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||||
|
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||||
|
PYEOF
|
||||||
|
|
||||||
|
echo "=== wallpaper done ==="
|
||||||
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
41
iso/builder/config/hooks/normal/9010-fix-toram.hook.chroot
Executable file
@@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9010-fix-toram.hook.chroot — patch live-boot toram to work with tmpfs (no O_DIRECT)
|
||||||
|
#
|
||||||
|
# live-boot tries "losetup --replace --direct-io=on" when re-associating the
|
||||||
|
# loop device to the RAM copy in /dev/shm. tmpfs does not support O_DIRECT,
|
||||||
|
# so the ioctl returns EINVAL and the verification step fails.
|
||||||
|
#
|
||||||
|
# The patch strips --direct-io from the losetup --replace call so that the
|
||||||
|
# re-association works on tmpfs (no fallback/retry is attempted), and rewrites
|
||||||
|
# the "Task finished with error." message printed on failed verification into a warning.
|
||||||
|
set -e
|
||||||
|
|
||||||
|
TORAM_SCRIPT="/usr/lib/live/boot/9990-toram-todisk.sh"
|
||||||
|
|
||||||
|
if [ ! -f "${TORAM_SCRIPT}" ]; then
|
||||||
|
echo "9010-fix-toram: ${TORAM_SCRIPT} not found, skipping"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patching ${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# Rewrite every losetup --replace call that passes --direct-io so that the
|
||||||
|
# option is dropped entirely, because tmpfs does not support O_DIRECT.
|
||||||
|
#
|
||||||
|
# The sed expressions below turn:
|
||||||
|
# losetup --replace --direct-io=on LOOP FILE
|
||||||
|
# into "losetup --replace LOOP FILE"; direct-io is never tried at boot.
|
||||||
|
#
|
||||||
|
# We also downgrade the fatal "Task finished with error." block to a warning
|
||||||
|
# so the boot continues if re-association fails (squashfs still accessible).
|
||||||
|
|
||||||
|
# 1. Strip --direct-io=on from the losetup --replace call so it works on tmpfs.
|
||||||
|
sed -i 's/losetup --replace --direct-io=on/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
sed -i 's/losetup --replace --direct-io/losetup --replace/g' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
# 2. Turn the hard error into a warning so boot continues.
|
||||||
|
# live-boot prints this exact string when verification fails.
|
||||||
|
sed -i 's/echo "Task finished with error\."/echo "Warning: toram re-association failed, continuing boot (squashfs still in RAM)"/' "${TORAM_SCRIPT}"
|
||||||
|
|
||||||
|
echo "9010-fix-toram: patch applied"
|
||||||
|
grep -n "losetup" "${TORAM_SCRIPT}" | head -20 || true
|
||||||
@@ -1,6 +1,10 @@
|
|||||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
# NVIDIA DCGM (Data Center GPU Manager).
|
||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
# Validate uses dcgmi diagnostics; Burn uses dcgmproftester as the official
|
||||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
# NVIDIA max-compute recipe. The smoketest/runtime contract treats
|
||||||
|
# dcgmproftester as required in the LiveCD.
|
||||||
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
|
# explicitly.
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -60,9 +60,15 @@ qrencode
|
|||||||
# Local desktop (openbox + chromium kiosk)
|
# Local desktop (openbox + chromium kiosk)
|
||||||
openbox
|
openbox
|
||||||
tint2
|
tint2
|
||||||
|
feh
|
||||||
|
python3-pil
|
||||||
xorg
|
xorg
|
||||||
xterm
|
xterm
|
||||||
chromium
|
chromium
|
||||||
|
mousepad
|
||||||
|
pcmanfm
|
||||||
|
ristretto
|
||||||
|
mupdf
|
||||||
xserver-xorg-video-fbdev
|
xserver-xorg-video-fbdev
|
||||||
xserver-xorg-video-vesa
|
xserver-xorg-video-vesa
|
||||||
lightdm
|
lightdm
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ echo ""
|
|||||||
KVER=$(uname -r)
|
KVER=$(uname -r)
|
||||||
info "kernel: $KVER"
|
info "kernel: $KVER"
|
||||||
NVIDIA_BOOT_MODE="normal"
|
NVIDIA_BOOT_MODE="normal"
|
||||||
|
NVIDIA_MODULES_FLAVOR="proprietary"
|
||||||
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
for arg in $(cat /proc/cmdline 2>/dev/null); do
|
||||||
case "$arg" in
|
case "$arg" in
|
||||||
bee.nvidia.mode=*)
|
bee.nvidia.mode=*)
|
||||||
@@ -34,7 +35,11 @@ for arg in $(cat /proc/cmdline 2>/dev/null); do
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
if [ -f /etc/bee-nvidia-modules-flavor ]; then
|
||||||
|
NVIDIA_MODULES_FLAVOR="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null || echo proprietary)"
|
||||||
|
fi
|
||||||
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||||
|
info "nvidia modules flavor: ${NVIDIA_MODULES_FLAVOR}"
|
||||||
|
|
||||||
# --- PATH & binaries ---
|
# --- PATH & binaries ---
|
||||||
echo "-- PATH & binaries --"
|
echo "-- PATH & binaries --"
|
||||||
@@ -52,6 +57,31 @@ else
|
|||||||
fail "nvidia-smi: NOT FOUND"
|
fail "nvidia-smi: NOT FOUND"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v dcgmi 2>/dev/null); then
|
||||||
|
ok "dcgmi found: $p"
|
||||||
|
else
|
||||||
|
fail "dcgmi: NOT FOUND"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v nv-hostengine 2>/dev/null); then
|
||||||
|
ok "nv-hostengine found: $p"
|
||||||
|
else
|
||||||
|
fail "nv-hostengine: NOT FOUND"
|
||||||
|
fi
|
||||||
|
|
||||||
|
DCGM_PROFTESTER=""
|
||||||
|
for tool in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
|
DCGM_PROFTESTER="$p"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ -n "$DCGM_PROFTESTER" ]; then
|
||||||
|
ok "dcgmproftester found: $DCGM_PROFTESTER"
|
||||||
|
else
|
||||||
|
fail "dcgmproftester: NOT FOUND"
|
||||||
|
fi
|
||||||
|
|
||||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
ok "$tool found: $p"
|
ok "$tool found: $p"
|
||||||
@@ -60,6 +90,12 @@ for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v nvbandwidth 2>/dev/null); then
|
||||||
|
ok "nvbandwidth found: $p"
|
||||||
|
else
|
||||||
|
warn "nvbandwidth: NOT FOUND"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- NVIDIA modules --"
|
echo "-- NVIDIA modules --"
|
||||||
KO_DIR="/usr/local/lib/nvidia"
|
KO_DIR="/usr/local/lib/nvidia"
|
||||||
@@ -79,10 +115,12 @@ fi
|
|||||||
for mod in nvidia_modeset nvidia_uvm; do
|
for mod in nvidia_modeset nvidia_uvm; do
|
||||||
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
if /sbin/lsmod 2>/dev/null | grep -q "^$mod "; then
|
||||||
ok "module loaded: $mod"
|
ok "module loaded: $mod"
|
||||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||||
fail "module NOT loaded in normal mode: $mod"
|
fail "module NOT loaded in normal mode: $mod"
|
||||||
else
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||||
warn "module not loaded in GSP-off mode: $mod"
|
warn "module not loaded in GSP-off mode: $mod"
|
||||||
|
else
|
||||||
|
fail "module NOT loaded: $mod"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
@@ -98,10 +136,12 @@ done
|
|||||||
|
|
||||||
if [ -e /dev/nvidia-uvm ]; then
|
if [ -e /dev/nvidia-uvm ]; then
|
||||||
ok "/dev/nvidia-uvm exists"
|
ok "/dev/nvidia-uvm exists"
|
||||||
elif [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; then
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ] && { [ "${NVIDIA_BOOT_MODE}" = "normal" ] || [ "${NVIDIA_BOOT_MODE}" = "full" ]; }; then
|
||||||
fail "/dev/nvidia-uvm missing in normal mode"
|
fail "/dev/nvidia-uvm missing in normal mode"
|
||||||
else
|
elif [ "${NVIDIA_MODULES_FLAVOR}" = "proprietary" ]; then
|
||||||
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
warn "/dev/nvidia-uvm missing — CUDA stress path may be unavailable until loaded on demand"
|
||||||
|
else
|
||||||
|
fail "/dev/nvidia-uvm missing"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
@@ -171,6 +211,12 @@ for svc in bee-nvidia bee-network bee-preflight bee-audit bee-web; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if systemctl is-active --quiet bee-selfheal.timer 2>/dev/null; then
|
||||||
|
ok "timer active: bee-selfheal.timer"
|
||||||
|
else
|
||||||
|
fail "timer NOT active: bee-selfheal.timer"
|
||||||
|
fi
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- runtime health --"
|
echo "-- runtime health --"
|
||||||
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
|
if [ -f /appdata/bee/export/runtime-health.json ] && [ -s /appdata/bee/export/runtime-health.json ]; then
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||||
Before=bee-web.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: boot status display
|
||||||
|
After=systemd-user-sessions.service
|
||||||
|
Before=getty@tty1.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=no
|
||||||
|
ExecStart=/usr/local/bin/bee-boot-status
|
||||||
|
TTYPath=/dev/tty1
|
||||||
|
StandardInput=tty
|
||||||
|
StandardOutput=tty
|
||||||
|
StandardError=tty
|
||||||
|
TTYReset=yes
|
||||||
|
TTYVHangup=yes
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
9
iso/overlay/etc/systemd/system/bee-selfheal.service
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: periodic runtime self-heal
|
||||||
|
After=bee-web.service bee-audit.service bee-preflight.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-selfheal.log /usr/local/bin/bee-selfheal
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
11
iso/overlay/etc/systemd/system/bee-selfheal.timer
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
[Unit]
|
||||||
|
Description=Bee: run self-heal checks periodically
|
||||||
|
|
||||||
|
[Timer]
|
||||||
|
OnBootSec=45sec
|
||||||
|
OnUnitActiveSec=60sec
|
||||||
|
AccuracySec=15sec
|
||||||
|
Unit=bee-selfheal.service
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=timers.target
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
After=bee-audit.service
|
StartLimitIntervalSec=0
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-web.log /usr/local/bin/bee web --listen :80 --audit-path /appdata/bee/export/bee-audit.json --export-dir /appdata/bee/export --title "Bee Hardware Audit"
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=2
|
RestartSec=3
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[Unit]
|
||||||
|
After=bee-boot-status.service
|
||||||
@@ -1,6 +1,4 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Wants=bee-preflight.service
|
|
||||||
After=bee-preflight.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
ExecStartPre=/usr/local/bin/bee-display-mode
|
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||||
|
|||||||
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-boot-status — boot progress display on tty1.
|
||||||
|
# Shows live service status until all bee services are done or failed,
|
||||||
|
# then exits so getty can show the login prompt.
|
||||||
|
|
||||||
|
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||||
|
|
||||||
|
# svc_state NAME — print the systemd active state of NAME.service as ONE word.
#
# `systemctl is-active` prints the state itself and exits non-zero for every
# state other than "active". Chaining `|| echo inactive` therefore appended a
# second line (e.g. "failed\ninactive") that none of the callers' case
# patterns could match. Capture the output instead and fall back to
# "inactive" only when systemctl printed nothing at all.
svc_state() {
    _svc_state_out="$(systemctl is-active "$1.service" 2>/dev/null)"
    printf '%s\n' "${_svc_state_out:-inactive}"
}
|
||||||
|
|
||||||
|
# svc_icon NAME — print a coloured status tag for NAME.service (no newline).
# The tag is chosen from the word reported by svc_state; any state not listed
# below gets the grey "?" tag.
svc_icon() {
    local unit_state
    unit_state="$(svc_state "$1")"
    if [ "$unit_state" = "active" ]; then
        printf '\033[32m[ OK ]\033[0m'
    elif [ "$unit_state" = "failed" ]; then
        printf '\033[31m[ FAIL ]\033[0m'
    elif [ "$unit_state" = "activating" ]; then
        printf '\033[33m[ .. ]\033[0m'
    elif [ "$unit_state" = "deactivating" ]; then
        printf '\033[33m[ stop ]\033[0m'
    elif [ "$unit_state" = "inactive" ]; then
        printf '\033[90m[ ]\033[0m'
    else
        printf '\033[90m[ ? ]\033[0m'
    fi
}
|
||||||
|
|
||||||
|
# svc_detail NAME — print an optional one-line annotation for NAME.service.
#   failed     -> the unit's Result property in red, unless empty or "success"
#   activating -> the latest journal line of the unit, cut to 55 characters
# Nothing is printed for any other state.
svc_detail() {
    local unit="$1" current
    current="$(svc_state "$unit")"
    if [ "$current" = "failed" ]; then
        local result
        result="$(systemctl show -p Result "$unit.service" 2>/dev/null | cut -d= -f2)"
        [ -n "$result" ] && [ "$result" != "success" ] && printf ' \033[31m(%s)\033[0m' "$result"
    elif [ "$current" = "activating" ]; then
        local last_line
        last_line="$(journalctl -u "$unit.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
        [ -n "$last_line" ] && printf ' \033[90m%s\033[0m' "$last_line"
    fi
}
|
||||||
|
|
||||||
|
# all_critical_done — succeed once every unit listed in $CRITICAL reports a
# settled state (active, failed or inactive); return 1 as soon as one unit is
# in any other state.
all_critical_done() {
    local settled
    for svc in $CRITICAL; do
        settled="$(svc_state "$svc")"
        if [ "$settled" != "active" ] && [ "$settled" != "failed" ] && [ "$settled" != "inactive" ]; then
            return 1
        fi
    done
    return 0
}
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
# move to top-left and clear screen
|
||||||
|
printf '\033[H\033[2J'
|
||||||
|
|
||||||
|
printf '\n'
|
||||||
|
printf ' \033[33m███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗\033[0m\n'
|
||||||
|
printf ' \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝\033[0m\n'
|
||||||
|
printf ' \033[33m█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗\033[0m\n'
|
||||||
|
printf ' \033[33m██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝\033[0m\n'
|
||||||
|
printf ' \033[33m███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗\033[0m\n'
|
||||||
|
printf ' \033[33m╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
|
||||||
|
printf ' Hardware Audit LiveCD\n'
|
||||||
|
printf '\n'
|
||||||
|
|
||||||
|
for svc in $ALL; do
|
||||||
|
printf ' %s %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
|
||||||
|
done
|
||||||
|
printf '\n'
|
||||||
|
|
||||||
|
# Network
|
||||||
|
ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf " %-16s %s\n", $NF, $2}')"
|
||||||
|
if [ -n "$ips" ]; then
|
||||||
|
printf ' \033[1mNetwork:\033[0m\n'
|
||||||
|
printf '%s\n' "$ips"
|
||||||
|
printf '\n'
|
||||||
|
fi
|
||||||
|
|
||||||
|
if all_critical_done; then
|
||||||
|
printf ' \033[1;32mSystem ready.\033[0m Audit is running in the background.\n'
|
||||||
|
first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
|
||||||
|
if [ -n "$first_ip" ]; then
|
||||||
|
printf ' Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
|
||||||
|
fi
|
||||||
|
printf '\n'
|
||||||
|
sleep 3
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' \033[90mStarting up...\033[0m\n'
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
@@ -62,6 +62,8 @@ done
|
|||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
|
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=300
|
DURATION_SEC=300
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
|
TEST_SLICE_SECONDS=300
|
||||||
JOHN_DIR="/usr/local/lib/bee/john/run"
|
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||||
JOHN_BIN="${JOHN_DIR}/john"
|
JOHN_BIN="${JOHN_DIR}/john"
|
||||||
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||||
@@ -116,7 +117,7 @@ ensure_opencl_ready() {
|
|||||||
|
|
||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -151,14 +152,19 @@ done
|
|||||||
|
|
||||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
export CUDA_VISIBLE_DEVICES="${FINAL}"
|
||||||
|
|
||||||
JOHN_DEVICES=""
|
JOHN_DEVICES=""
|
||||||
|
local_id=1
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
opencl_id=$((id + 1))
|
opencl_id="${local_id}"
|
||||||
if [ -z "${JOHN_DEVICES}" ]; then
|
if [ -z "${JOHN_DEVICES}" ]; then
|
||||||
JOHN_DEVICES="${opencl_id}"
|
JOHN_DEVICES="${opencl_id}"
|
||||||
else
|
else
|
||||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||||
fi
|
fi
|
||||||
|
local_id=$((local_id + 1))
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
@@ -189,14 +195,51 @@ CHOSEN_FORMAT=$(choose_format) || {
|
|||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "format=${CHOSEN_FORMAT}"
|
# run_john_loop OPENCL_ID DEADLINE — run `john --test` on one OpenCL device in
# consecutive slices until the epoch-seconds DEADLINE has passed.
# Each slice lasts at most TEST_SLICE_SECONDS and never longer than the time
# remaining. Returns 1 as soon as one john invocation fails.
run_john_loop() {
    opencl_id="$1"
    deadline="$2"
    round=0
    while :; do
        now=$(date +%s)
        remaining=$((deadline - now))
        # Stop once the deadline has been reached.
        [ "${remaining}" -gt 0 ] || break
        round=$((round + 1))
        # Clamp the slice to the configured maximum.
        if [ "${remaining}" -gt "${TEST_SLICE_SECONDS}" ]; then
            slice="${TEST_SLICE_SECONDS}"
        else
            slice="${remaining}"
        fi
        echo "device=${opencl_id} round=${round} remaining_sec=${remaining} slice_sec=${slice}"
        if ! ./john --test="${slice}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}"; then
            return 1
        fi
    done
}
|
||||||
|
|
||||||
PIDS=""
|
PIDS=""
|
||||||
|
# cleanup — trap handler for EXIT/INT/TERM: signal every background worker
# recorded in $PIDS, reap them, then exit with the status that was current
# when the handler was entered.
cleanup() {
    rc=$?
    # Disarm the traps first so the final `exit` does not re-enter this handler.
    trap - EXIT INT TERM
    for worker in ${PIDS}; do
        kill "${worker}" 2>/dev/null || :
    done
    for worker in ${PIDS}; do
        wait "${worker}" 2>/dev/null || :
    done
    exit "${rc}"
}
|
||||||
|
trap cleanup EXIT INT TERM
|
||||||
|
|
||||||
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
|
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||||
_first=1
|
_first=1
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||||
PIDS="${PIDS} $!"
|
pid=$!
|
||||||
|
PIDS="${PIDS} ${pid}"
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
|
|||||||
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
||||||
echo "iters=${ITERS}"
|
echo "iters=${ITERS}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
deadline=$(( $(date +%s) + SECONDS ))
|
deadline=$(( $(date +%s) + SECONDS ))
|
||||||
round=0
|
round=0
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,19 @@ NVIDIA_KO_DIR="/usr/local/lib/nvidia"
|
|||||||
|
|
||||||
log() { echo "[bee-nvidia] $*"; }
|
log() { echo "[bee-nvidia] $*"; }
|
||||||
|
|
||||||
|
# read_nvidia_modules_flavor — print the NVIDIA kernel-module flavor.
# Reads /etc/bee-nvidia-modules-flavor with all whitespace removed and echoes
# it when it is exactly "open" or "proprietary"; in every other case
# (file missing, empty or unrecognised content) echoes "proprietary".
read_nvidia_modules_flavor() {
    if [ ! -f /etc/bee-nvidia-modules-flavor ]; then
        echo "proprietary"
        return
    fi
    flavor="$(tr -d '[:space:]' </etc/bee-nvidia-modules-flavor 2>/dev/null)"
    if [ "$flavor" = "open" ] || [ "$flavor" = "proprietary" ]; then
        echo "$flavor"
        return 0
    fi
    echo "proprietary"
}
|
||||||
|
|
||||||
log "kernel: $(uname -r)"
|
log "kernel: $(uname -r)"
|
||||||
|
|
||||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
||||||
@@ -40,6 +53,8 @@ if [ -z "$nvidia_mode" ]; then
|
|||||||
nvidia_mode="normal"
|
nvidia_mode="normal"
|
||||||
fi
|
fi
|
||||||
log "boot mode: $nvidia_mode"
|
log "boot mode: $nvidia_mode"
|
||||||
|
nvidia_modules_flavor="$(read_nvidia_modules_flavor)"
|
||||||
|
log "modules flavor: $nvidia_modules_flavor"
|
||||||
|
|
||||||
load_module() {
|
load_module() {
|
||||||
mod="$1"
|
mod="$1"
|
||||||
@@ -50,11 +65,93 @@ load_module() {
|
|||||||
log "WARN: not found: $ko"
|
log "WARN: not found: $ko"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
if insmod "$ko" "$@"; then
|
if timeout 90 insmod "$ko" "$@"; then
|
||||||
log "loaded: $mod $*"
|
log "loaded: $mod $*"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
log "WARN: failed to load: $mod"
|
log "WARN: failed to load: $mod (exit $?)"
|
||||||
|
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# nvidia_is_functional — succeed when /proc/devices contains an entry whose
# name is "nvidiactl"; fail otherwise (including when the file is unreadable).
nvidia_is_functional() {
    grep -q ' nvidiactl$' /proc/devices 2>/dev/null
}
|
||||||
|
|
||||||
|
# load_module_with_gsp_fallback — load nvidia.ko with GSP firmware enabled and
# fall back to NVreg_EnableGpuFirmware=0 when GSP initialisation hangs.
#
# Success is judged by nvidia_is_functional (nvidiactl listed in /proc/devices),
# not by insmod's return alone. On success the effective mode is written to
# /run/bee-nvidia-mode ("gsp-on" or "gsp-off"); "gsp-stuck" is written when the
# hung module cannot be unloaded. Other failure paths leave the file untouched.
#
# Returns 0 when the driver is functional, 1 otherwise.
load_module_with_gsp_fallback() {
    ko="$NVIDIA_KO_DIR/nvidia.ko"
    if [ ! -f "$ko" ]; then
        log "ERROR: not found: $ko"
        return 1
    fi

    # Run insmod in background — on some converted SXM→PCIe cards GSP enters an
    # infinite crash/reload loop and insmod never returns. We check for successful
    # initialization by polling /proc/devices for nvidiactl instead of waiting for
    # insmod to exit.
    log "loading nvidia (GSP enabled, timeout 90s)"
    insmod "$ko" &
    _insmod_pid=$!

    _waited=0
    while [ $_waited -lt 90 ]; do
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP enabled, ${_waited}s)"
            echo "gsp-on" > /run/bee-nvidia-mode
            return 0
        fi
        # Check if insmod exited with an error before timeout
        if ! kill -0 "$_insmod_pid" 2>/dev/null; then
            wait "$_insmod_pid"
            _rc=$?
            if [ $_rc -ne 0 ]; then
                log "nvidia load failed (exit $_rc)"
                dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
                return 1
            fi
            # insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
            sleep 2
            if nvidia_is_functional; then
                log "loaded: nvidia (GSP enabled, ${_waited}s)"
                # Record the mode here as well, so this delayed-success path
                # leaves the same marker as the immediate-success path above.
                echo "gsp-on" > /run/bee-nvidia-mode
                return 0
            fi
            log "insmod exited 0 but nvidiactl missing — treating as failure"
            return 1
        fi
        sleep 1
        _waited=$((_waited + 1))
    done

    # GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
    log "nvidia GSP init timed out after 90s"
    kill "$_insmod_pid" 2>/dev/null || true
    wait "$_insmod_pid" 2>/dev/null || true

    # Attempt to unload the partially-initialized module
    if ! rmmod nvidia 2>/dev/null; then
        # Module is stuck in the kernel — cannot reload with different params.
        # User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
        log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
        log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
        echo "gsp-stuck" > /run/bee-nvidia-mode
        return 1
    fi

    sleep 2
    log "retrying with NVreg_EnableGpuFirmware=0"
    log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"

    if insmod "$ko" NVreg_EnableGpuFirmware=0; then
        if nvidia_is_functional; then
            log "loaded: nvidia (GSP disabled)"
            echo "gsp-off" > /run/bee-nvidia-mode
            return 0
        fi
        log "insmod gsp-off exited 0 but nvidiactl missing"
        return 1
    fi

    log "nvidia load failed (GSP=off)"
    dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
    return 1
}
|
||||||
@@ -68,37 +165,54 @@ load_host_module() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
case "$nvidia_mode" in
|
if [ "$nvidia_modules_flavor" = "open" ]; then
|
||||||
normal|full)
|
case "$nvidia_mode" in
|
||||||
if ! load_module nvidia; then
|
gsp-off|safe|nomsi)
|
||||||
exit 1
|
log "ignoring boot mode ${nvidia_mode} for open NVIDIA modules"
|
||||||
fi
|
;;
|
||||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
esac
|
||||||
# exported by the generic "video" module. Best-effort only; compute paths
|
if ! load_module nvidia; then
|
||||||
# remain functional even if display-related modules stay absent.
|
exit 1
|
||||||
load_host_module video || true
|
fi
|
||||||
load_module nvidia-modeset || true
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
load_module nvidia-uvm || true
|
# exported by the generic "video" module. Best-effort only; compute paths
|
||||||
;;
|
# remain functional even if display-related modules stay absent.
|
||||||
gsp-off|safe)
|
load_host_module video || true
|
||||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
load_module nvidia-modeset || true
|
||||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
load_module nvidia-uvm || true
|
||||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
else
|
||||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
case "$nvidia_mode" in
|
||||||
exit 1
|
normal|full)
|
||||||
fi
|
if ! load_module_with_gsp_fallback; then
|
||||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
exit 1
|
||||||
;;
|
fi
|
||||||
nomsi|*)
|
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||||
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
# exported by the generic "video" module. Best-effort only; compute paths
|
||||||
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
# remain functional even if display-related modules stay absent.
|
||||||
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
load_host_module video || true
|
||||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
load_module nvidia-modeset || true
|
||||||
exit 1
|
load_module nvidia-uvm || true
|
||||||
fi
|
;;
|
||||||
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
gsp-off|safe)
|
||||||
;;
|
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||||
esac
|
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||||
|
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||||
|
;;
|
||||||
|
nomsi|*)
|
||||||
|
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
||||||
|
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
||||||
|
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
||||||
|
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||||
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
nvidia_major=$(grep -m1 ' nvidiactl$' /proc/devices | awk '{print $1}')
|
||||||
@@ -127,14 +241,45 @@ fi
|
|||||||
ldconfig 2>/dev/null || true
|
ldconfig 2>/dev/null || true
|
||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
|
# Keep persistence mode enabled across the session so dcgmi / stress tools do
|
||||||
|
# not fail with deployment warnings on otherwise healthy GPUs.
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||||
|
log "enabled NVIDIA persistence mode"
|
||||||
|
else
|
||||||
|
log "WARN: failed to enable NVIDIA persistence mode"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# "group is empty" even when GPUs and modules are present.
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
# keep a stale empty inventory and dcgmi diag later reports no testable entities.
|
||||||
if command -v nv-hostengine >/dev/null 2>&1; then
|
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
log "nv-hostengine already running — skipping"
|
if command -v pkill >/dev/null 2>&1; then
|
||||||
else
|
pkill -x nv-hostengine >/dev/null 2>&1 || true
|
||||||
|
tries=0
|
||||||
|
while pgrep -x nv-hostengine >/dev/null 2>&1; do
|
||||||
|
tries=$((tries + 1))
|
||||||
|
if [ "${tries}" -ge 10 ]; then
|
||||||
|
log "WARN: nv-hostengine is still running after restart request"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
log "WARN: keeping existing nv-hostengine process"
|
||||||
|
else
|
||||||
|
log "nv-hostengine restarted"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: pkill not found — cannot refresh nv-hostengine inventory"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
if ! pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
nv-hostengine
|
nv-hostengine
|
||||||
log "nv-hostengine started"
|
log "nv-hostengine started"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -7,16 +7,24 @@ xset s off
|
|||||||
xset -dpms
|
xset -dpms
|
||||||
xset s noblank
|
xset s noblank
|
||||||
|
|
||||||
|
# Set desktop background.
|
||||||
|
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||||
|
feh --bg-fill /usr/share/bee/wallpaper.png
|
||||||
|
else
|
||||||
|
xsetroot -solid '#f6c90e'
|
||||||
|
fi
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
|
|
||||||
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
# Wait up to 60s for bee-web before opening Chromium.
|
||||||
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
# Without this Chromium gets connection-refused and shows a blank page.
|
||||||
i=0
|
_i=0
|
||||||
while [ $i -lt 120 ]; do
|
while [ $_i -lt 60 ]; do
|
||||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
curl -sf http://localhost/healthz >/dev/null 2>&1 && break
|
||||||
sleep 1
|
sleep 1
|
||||||
i=$((i+1))
|
_i=$((_i+1))
|
||||||
done
|
done
|
||||||
|
unset _i
|
||||||
|
|
||||||
chromium \
|
chromium \
|
||||||
--disable-infobars \
|
--disable-infobars \
|
||||||
@@ -24,7 +32,8 @@ chromium \
|
|||||||
--no-first-run \
|
--no-first-run \
|
||||||
--disable-session-crashed-bubble \
|
--disable-session-crashed-bubble \
|
||||||
--disable-features=TranslateUI \
|
--disable-features=TranslateUI \
|
||||||
|
--user-data-dir=/tmp/bee-chrome \
|
||||||
--start-maximized \
|
--start-maximized \
|
||||||
http://localhost/ &
|
http://localhost/loading &
|
||||||
|
|
||||||
exec openbox
|
exec openbox
|
||||||
|
|||||||
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
99
iso/overlay/usr/local/bin/bee-selfheal
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# bee-selfheal — periodic best-effort recovery for critical live ISO services.
|
||||||
|
|
||||||
|
# Treat expansion of an unset variable as an error.
set -o nounset

# Tag prepended to every message emitted by log().
LOG_PREFIX='bee-selfheal'

# Directory holding the exported artifacts this script checks.
EXPORT_DIR='/appdata/bee/export'

# Directory used as a lock so that only one self-heal run is active at a time.
LOCK_DIR='/run/bee-selfheal.lock'

# Artifacts whose presence (non-empty) is verified on every run.
AUDIT_JSON="${EXPORT_DIR}/bee-audit.json"
RUNTIME_JSON="${EXPORT_DIR}/runtime-health.json"
|
||||||
|
|
||||||
|
# log MESSAGE...
# Print the arguments, joined by spaces, on stdout as a single line prefixed
# with "[${LOG_PREFIX}] ".
log() {
    printf '[%s] %s\n' "${LOG_PREFIX}" "$*"
}
|
||||||
|
|
||||||
|
# have_nvidia_gpu
# Succeeds (status 0) when the `lspci -nn` listing contains "10de:" in any
# letter case; fails otherwise, including when lspci produces no output.
have_nvidia_gpu() {
    case "$(lspci -nn 2>/dev/null)" in
        *10[dD][eE]:*) return 0 ;;
    esac
    return 1
}
|
||||||
|
|
||||||
|
# service_active UNIT
# Returns the exit status of `systemctl is-active --quiet UNIT`, with any
# error output discarded.
service_active() {
    local unit="$1"
    systemctl is-active --quiet "${unit}" 2>/dev/null
}
|
||||||
|
|
||||||
|
# restart_service UNIT
# Run `systemctl restart UNIT` silently and log the outcome.
# Returns 0 when the restart command succeeded, 1 otherwise.
restart_service() {
    local unit="$1"

    if ! systemctl restart "${unit}" >/dev/null 2>&1; then
        log "WARN: failed to restart ${unit}"
        return 1
    fi

    log "restarted ${unit}"
    return 0
}
|
||||||
|
|
||||||
|
# file_ready PATH
# Succeeds when PATH exists and has a size greater than zero.
file_ready() {
    test -s "$1"
}
|
||||||
|
|
||||||
|
# artifact_state PATH
# Classify an artifact and print exactly one word on stdout:
#   ready        PATH exists and is non-empty
#   interrupted  PATH is not ready but "PATH.tmp" exists
#   missing      neither of the above
# The function's own exit status is always 0; callers read the printed word.
artifact_state() {
    local target="$1"
    local state="missing"

    if [ -s "${target}" ]; then
        state="ready"
    elif [ -e "${target}.tmp" ]; then
        state="interrupted"
    fi

    echo "${state}"
}
|
||||||
|
|
||||||
|
# web_healthy
# Probe the local web server: open a TCP connection to 127.0.0.1:80 through
# bash's /dev/tcp redirection, send "GET /healthz" as an HTTP/1.0 request and
# succeed only when the response contains a line that is exactly "ok".
#
# The probe is bounded to 5 seconds when `timeout` is installed. Without a
# bound, a server that accepts the connection but never answers would block
# grep forever; the script would then never exit, its lock directory would
# stay in place, and every later run would bail out as "already active".
# If `timeout` is not available the probe runs unbounded, as before.
#
# All output of the probe is discarded; only the exit status is meaningful.
web_healthy() {
    # Single-quoted so the text reaches the child bash verbatim.
    local probe='exec 3<>/dev/tcp/127.0.0.1/80 && printf "GET /healthz HTTP/1.0\r\nHost: localhost\r\n\r\n" >&3 && grep -q "^ok$" <&3'

    if command -v timeout >/dev/null 2>&1; then
        timeout 5 bash -c "${probe}" >/dev/null 2>&1
    else
        bash -c "${probe}" >/dev/null 2>&1
    fi
}
|
||||||
|
|
||||||
|
# Make sure the export directory and /run exist before using them.
mkdir -p "${EXPORT_DIR}" /run

# Plain mkdir fails when the directory already exists, so the lock directory
# tells us whether another run is in progress. Losing the race is not an error.
mkdir "${LOCK_DIR}" 2>/dev/null || {
    log "another self-heal run is already active"
    exit 0
}
# Release the lock whenever this shell exits.
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT

log "start"

# 1. NVIDIA hardware is present but its first device node is not.
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
    log "NVIDIA GPU detected but /dev/nvidia0 is missing"
    restart_service bee-nvidia.service || true
fi

# 2. Runtime-health artifact: anything other than "ready" re-runs preflight.
runtime_state="$(artifact_state "${RUNTIME_JSON}")"
case "${runtime_state}" in
    ready)
        ;;
    interrupted)
        log "runtime-health.json.tmp exists — interrupted runtime-health write detected"
        restart_service bee-preflight.service || true
        ;;
    *)
        log "runtime-health.json missing or empty"
        restart_service bee-preflight.service || true
        ;;
esac

# 3. Audit artifact: anything other than "ready" re-runs the audit service.
audit_state="$(artifact_state "${AUDIT_JSON}")"
case "${audit_state}" in
    ready)
        ;;
    interrupted)
        log "bee-audit.json.tmp exists — interrupted audit write detected"
        restart_service bee-audit.service || true
        ;;
    *)
        log "bee-audit.json missing or empty"
        restart_service bee-audit.service || true
        ;;
esac

# 4. Web service: restart when the unit is inactive, or when it is active but
#    the health probe fails. The probe only runs when the unit is active.
web_problem=""
if ! service_active bee-web.service; then
    web_problem="bee-web.service is not active"
elif ! web_healthy; then
    web_problem="bee-web health check failed"
fi
if [ -n "${web_problem}" ]; then
    log "${web_problem}"
    restart_service bee-web.service || true
fi

log "done"
|
||||||
Reference in New Issue
Block a user