Add NVIDIA benchmark reporting flow

2026-04-05 10:30:56 +03:00
parent 143b7dca5d
commit bf47c8dbd2
9 changed files with 1559 additions and 40 deletions
--- a/audit/cmd/bee/main.go
+++ b/audit/cmd/bee/main.go
@@ -8,6 +8,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime/debug"
+	"strconv"
 	"strings"

 	"bee/audit/internal/app"
@@ -35,15 +36,13 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 		Level: slog.LevelInfo,
 	})))
 	defer func() {
-		rec := recover()
-		if rec == nil {
-			return
+		if rec := recover(); rec != nil {
+			slog.Error("fatal panic",
+				"panic", fmt.Sprint(rec),
+				"stack", string(debug.Stack()),
+			)
+			exitCode = 1
 		}
-		slog.Error("fatal panic",
-			"panic", fmt.Sprint(rec),
-			"stack", string(debug.Stack()),
-		)
-		exitCode = 1
 	}()

 	if len(args) == 0 {
@@ -70,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
 		return runWeb(args[1:], stdout, stderr)
 	case "sat":
 		return runSAT(args[1:], stdout, stderr)
+	case "benchmark":
+		return runBenchmark(args[1:], stdout, stderr)
 	case "version", "--version", "-version":
 		fmt.Fprintln(stdout, Version)
 		return 0
@@ -88,6 +89,7 @@ func printRootUsage(w io.Writer) {
  bee support-bundle --output stdout|file:<path>
  bee web     --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
  bee sat nvidia|memory|storage|cpu [--duration <seconds>]
+  bee benchmark nvidia [--profile standard|stability|overnight]
  bee version
  bee help [command]`)
 }
@@ -106,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
 		return runWeb([]string{"--help"}, stdout, stdout)
 	case "sat":
 		return runSAT([]string{"--help"}, stdout, stderr)
+	case "benchmark":
+		return runBenchmark([]string{"--help"}, stdout, stderr)
 	case "version":
 		fmt.Fprintln(stdout, "usage: bee version")
 		return 0
@@ -395,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
 	slog.Info("sat archive written", "target", target, "path", archive)
 	return 0
 }
+
+// runBenchmark implements "bee benchmark nvidia"; usage errors and the run's log lines
+// go to stderr. It returns 0 on success or help, 2 on usage errors, 1 on run failure.
+func runBenchmark(args []string, stdout, stderr io.Writer) int {
+	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"
+	switch {
+	case len(args) == 0:
+		fmt.Fprintln(stderr, usage)
+		return 2
+	case args[0] == "help" || args[0] == "--help" || args[0] == "-h":
+		fmt.Fprintln(stdout, usage)
+		return 0
+	case args[0] != "nvidia":
+		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", args[0])
+		fmt.Fprintln(stderr, usage)
+		return 2
+	}
+
+	fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+	profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
+	devices := fs.String("devices", "", "comma-separated GPU indices to include")
+	exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
+	sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
+	skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
+	if err := fs.Parse(args[1:]); err != nil {
+		if err == flag.ErrHelp {
+			return 0
+		}
+		return 2
+	}
+	if fs.NArg() != 0 {
+		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
+		return 2
+	}
+
+	includeIndices, err := parseBenchmarkIndexCSV(*devices)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
+		return 2
+	}
+	excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
+	if err != nil {
+		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
+		return 2
+	}
+
+	application := app.New(platform.New())
+	archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
+		Profile:           *profile,
+		SizeMB:            *sizeMB,
+		GPUIndices:        includeIndices,
+		ExcludeGPUIndices: excludeIndices,
+		RunNCCL:           !*skipNCCL,
+	}, func(s string) { fmt.Fprintln(stderr, s) })
+	if err != nil {
+		slog.Error("run benchmark", "target", args[0], "err", err)
+		return 1
+	}
+	slog.Info("benchmark archive written", "target", args[0], "path", archive)
+	return 0
+}
+
+// parseBenchmarkIndexCSV parses a comma-separated list of GPU indices such as
+// "0, 2,3". Whitespace around fields and empty fields are ignored, so a blank
+// string yields a nil slice. A field that does not parse as a non-negative
+// integer produces an error quoting that field.
+func parseBenchmarkIndexCSV(raw string) ([]int, error) {
+	var out []int
+	for _, field := range strings.Split(raw, ",") {
+		token := strings.TrimSpace(field)
+		if token == "" {
+			continue
+		}
+		idx, err := strconv.Atoi(token)
+		if err != nil || idx < 0 {
+			return nil, fmt.Errorf("bad gpu index %q", token)
+		}
+		out = append(out, idx)
+	}
+	return out, nil
+}