484 lines
13 KiB
Go
484 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"os"
|
|
"runtime/debug"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"bee/audit/internal/app"
|
|
"bee/audit/internal/platform"
|
|
"bee/audit/internal/runtimeenv"
|
|
"bee/audit/internal/webui"
|
|
)
|
|
|
|
// Version is the build version string printed by `bee version`; it defaults
// to "dev".
// NOTE(review): presumably overridden at link time (-ldflags "-X main.Version=...")
// — confirm against the build scripts.
var Version = "dev"
|
|
|
|
func buildLabel() string {
|
|
label := strings.TrimSpace(Version)
|
|
if label == "" {
|
|
return "dev"
|
|
}
|
|
return label
|
|
}
|
|
|
|
func main() {
|
|
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
|
|
}
|
|
|
|
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
|
|
Level: slog.LevelInfo,
|
|
})))
|
|
defer func() {
|
|
if rec := recover(); rec != nil {
|
|
slog.Error("fatal panic",
|
|
"panic", fmt.Sprint(rec),
|
|
"stack", string(debug.Stack()),
|
|
)
|
|
exitCode = 1
|
|
}
|
|
}()
|
|
|
|
if len(args) == 0 {
|
|
printRootUsage(stderr)
|
|
return 2
|
|
}
|
|
|
|
switch args[0] {
|
|
case "help", "--help", "-h":
|
|
if len(args) > 1 {
|
|
return runHelp(args[1:], stdout, stderr)
|
|
}
|
|
printRootUsage(stdout)
|
|
return 0
|
|
case "audit":
|
|
return runAudit(args[1:], stdout, stderr)
|
|
case "export":
|
|
return runExport(args[1:], stdout, stderr)
|
|
case "preflight":
|
|
return runPreflight(args[1:], stdout, stderr)
|
|
case "support-bundle":
|
|
return runSupportBundle(args[1:], stdout, stderr)
|
|
case "web":
|
|
return runWeb(args[1:], stdout, stderr)
|
|
case "sat":
|
|
return runSAT(args[1:], stdout, stderr)
|
|
case "benchmark":
|
|
return runBenchmark(args[1:], stdout, stderr)
|
|
case "version", "--version", "-version":
|
|
fmt.Fprintln(stdout, Version)
|
|
return 0
|
|
default:
|
|
fmt.Fprintf(stderr, "bee: unknown command %q\n\n", args[0])
|
|
printRootUsage(stderr)
|
|
return 2
|
|
}
|
|
}
|
|
|
|
// printRootUsage writes the top-level command summary to w. The `bee web`
// line embeds app.DefaultAuditJSONPath as the example --audit-path value.
// NOTE(review): the lone `|` lines in and around this function look like
// extraction residue rather than intended output — confirm against the
// canonical source before changing the literal.
func printRootUsage(w io.Writer) {
|
|
fmt.Fprintln(w, `bee commands:
|
|
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
|
bee preflight --output stdout|file:<path>
|
|
bee export --target <device>
|
|
bee support-bundle --output stdout|file:<path>
|
|
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
|
|
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
|
bee benchmark nvidia [--profile standard|stability|overnight]
|
|
bee version
|
|
bee help [command]`)
|
|
}
|
|
|
|
func runHelp(args []string, stdout, stderr io.Writer) int {
|
|
switch args[0] {
|
|
case "audit":
|
|
return runAudit([]string{"--help"}, stdout, stdout)
|
|
case "export":
|
|
return runExport([]string{"--help"}, stdout, stdout)
|
|
case "preflight":
|
|
return runPreflight([]string{"--help"}, stdout, stdout)
|
|
case "support-bundle":
|
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
|
case "web":
|
|
return runWeb([]string{"--help"}, stdout, stdout)
|
|
case "sat":
|
|
return runSAT([]string{"--help"}, stdout, stderr)
|
|
case "benchmark":
|
|
return runBenchmark([]string{"--help"}, stdout, stderr)
|
|
case "version":
|
|
fmt.Fprintln(stdout, "usage: bee version")
|
|
return 0
|
|
default:
|
|
fmt.Fprintf(stderr, "bee help: unknown command %q\n\n", args[0])
|
|
printRootUsage(stderr)
|
|
return 2
|
|
}
|
|
}
|
|
|
|
func runAudit(args []string, stdout, stderr io.Writer) int {
|
|
fs := flag.NewFlagSet("audit", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
|
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
|
|
showVersion := fs.Bool("version", false, "print version and exit")
|
|
fs.Usage = func() {
|
|
fmt.Fprintln(stderr, "usage: bee audit [--runtime auto|local|livecd] [--output stdout|file:<path>]")
|
|
fs.PrintDefaults()
|
|
}
|
|
if err := fs.Parse(args); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
if *showVersion {
|
|
fmt.Fprintln(stdout, Version)
|
|
return 0
|
|
}
|
|
|
|
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
|
|
if err != nil {
|
|
slog.Error("resolve runtime", "err", err)
|
|
return 1
|
|
}
|
|
slog.Info("runtime resolved", "mode", runtimeInfo.Mode, "reason", runtimeInfo.Reason)
|
|
|
|
application := app.New(platform.New())
|
|
path, err := application.RunAudit(runtimeInfo.Mode, *output)
|
|
if err != nil {
|
|
slog.Error("run audit", "err", err)
|
|
return 1
|
|
}
|
|
if path != "stdout" {
|
|
slog.Info("audit output written", "path", path)
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func runExport(args []string, stdout, stderr io.Writer) int {
|
|
fs := flag.NewFlagSet("export", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
targetDevice := fs.String("target", "", "removable device path, e.g. /dev/sdb1")
|
|
fs.Usage = func() {
|
|
fmt.Fprintln(stderr, "usage: bee export --target <device>")
|
|
fs.PrintDefaults()
|
|
}
|
|
if err := fs.Parse(args); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
if strings.TrimSpace(*targetDevice) == "" {
|
|
fmt.Fprintln(stderr, "bee export: --target is required")
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
|
|
application := app.New(platform.New())
|
|
targets, err := application.ListRemovableTargets()
|
|
if err != nil {
|
|
slog.Error("list removable targets", "err", err)
|
|
return 1
|
|
}
|
|
|
|
for _, target := range targets {
|
|
if target.Device == *targetDevice {
|
|
path, err := application.ExportLatestAudit(target)
|
|
if err != nil {
|
|
slog.Error("export latest audit", "err", err)
|
|
return 1
|
|
}
|
|
slog.Info("audit exported", "path", path)
|
|
return 0
|
|
}
|
|
}
|
|
|
|
slog.Error("target device not found among removable filesystems", "device", *targetDevice)
|
|
return 1
|
|
}
|
|
|
|
func runPreflight(args []string, stdout, stderr io.Writer) int {
|
|
fs := flag.NewFlagSet("preflight", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
|
fs.Usage = func() {
|
|
fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
|
|
fs.PrintDefaults()
|
|
}
|
|
if err := fs.Parse(args); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
application := app.New(platform.New())
|
|
path, err := application.RunRuntimePreflight(*output)
|
|
if err != nil {
|
|
slog.Error("run preflight", "err", err)
|
|
return 1
|
|
}
|
|
if path != "stdout" {
|
|
slog.Info("runtime health written", "path", path)
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
|
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
|
|
fs.Usage = func() {
|
|
fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
|
|
fs.PrintDefaults()
|
|
}
|
|
if err := fs.Parse(args); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
path, err := app.BuildSupportBundle(app.DefaultExportDir)
|
|
if err != nil {
|
|
slog.Error("build support bundle", "err", err)
|
|
return 1
|
|
}
|
|
defer os.Remove(path)
|
|
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
slog.Error("read support bundle", "err", err)
|
|
return 1
|
|
}
|
|
switch {
|
|
case *output == "stdout":
|
|
if _, err := stdout.Write(raw); err != nil {
|
|
slog.Error("write support bundle stdout", "err", err)
|
|
return 1
|
|
}
|
|
case strings.HasPrefix(*output, "file:"):
|
|
dst := strings.TrimPrefix(*output, "file:")
|
|
if err := os.WriteFile(dst, raw, 0644); err != nil {
|
|
slog.Error("write support bundle", "err", err)
|
|
return 1
|
|
}
|
|
slog.Info("support bundle written", "path", dst)
|
|
default:
|
|
fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func runWeb(args []string, stdout, stderr io.Writer) int {
|
|
fs := flag.NewFlagSet("web", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
|
|
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
|
|
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
|
title := fs.String("title", "Bee Hardware Audit", "page title")
|
|
fs.Usage = func() {
|
|
fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
|
|
fs.PrintDefaults()
|
|
}
|
|
if err := fs.Parse(args); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fs.Usage()
|
|
return 2
|
|
}
|
|
|
|
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
|
|
|
|
runtimeInfo, err := runtimeenv.Detect("auto")
|
|
if err != nil {
|
|
slog.Warn("resolve runtime for web", "err", err)
|
|
}
|
|
|
|
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
|
|
Title: *title,
|
|
BuildLabel: buildLabel(),
|
|
AuditPath: *auditPath,
|
|
ExportDir: *exportDir,
|
|
App: app.New(platform.New()),
|
|
RuntimeMode: runtimeInfo.Mode,
|
|
}); err != nil {
|
|
slog.Error("run web", "err", err)
|
|
return 1
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func runSAT(args []string, stdout, stderr io.Writer) int {
|
|
if len(args) == 0 {
|
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
|
return 2
|
|
}
|
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
|
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
|
return 0
|
|
}
|
|
|
|
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
|
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
|
if err := fs.Parse(args[1:]); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
|
|
return 2
|
|
}
|
|
|
|
target := args[0]
|
|
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
|
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
|
return 2
|
|
}
|
|
|
|
application := app.New(platform.New())
|
|
var (
|
|
archive string
|
|
err error
|
|
)
|
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
|
switch target {
|
|
case "nvidia":
|
|
level := *diagLevel
|
|
if level > 0 {
|
|
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
|
} else {
|
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
|
}
|
|
case "memory":
|
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
|
case "storage":
|
|
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
|
case "cpu":
|
|
dur := *duration
|
|
if dur <= 0 {
|
|
dur = 60
|
|
}
|
|
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
|
|
}
|
|
if err != nil {
|
|
slog.Error("run sat", "target", target, "err", err)
|
|
return 1
|
|
}
|
|
slog.Info("sat archive written", "target", target, "path", archive)
|
|
return 0
|
|
}
|
|
|
|
func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
|
if len(args) == 0 {
|
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
|
return 2
|
|
}
|
|
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
|
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
|
return 0
|
|
}
|
|
target := args[0]
|
|
if target != "nvidia" {
|
|
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
|
|
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
|
|
return 2
|
|
}
|
|
|
|
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
|
|
fs.SetOutput(stderr)
|
|
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
|
|
devices := fs.String("devices", "", "comma-separated GPU indices to include")
|
|
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
|
|
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
|
|
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
|
|
if err := fs.Parse(args[1:]); err != nil {
|
|
if err == flag.ErrHelp {
|
|
return 0
|
|
}
|
|
return 2
|
|
}
|
|
if fs.NArg() != 0 {
|
|
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
|
|
return 2
|
|
}
|
|
|
|
includeIndices, err := parseBenchmarkIndexCSV(*devices)
|
|
if err != nil {
|
|
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
|
|
return 2
|
|
}
|
|
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
|
|
if err != nil {
|
|
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
|
|
return 2
|
|
}
|
|
|
|
application := app.New(platform.New())
|
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
|
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
|
|
Profile: *profile,
|
|
SizeMB: *sizeMB,
|
|
GPUIndices: includeIndices,
|
|
ExcludeGPUIndices: excludeIndices,
|
|
RunNCCL: !*skipNCCL,
|
|
}, logLine)
|
|
if err != nil {
|
|
slog.Error("run benchmark", "target", target, "err", err)
|
|
return 1
|
|
}
|
|
slog.Info("benchmark archive written", "target", target, "path", archive)
|
|
return 0
|
|
}
|
|
|
|
// parseBenchmarkIndexCSV converts a comma-separated list of GPU indices into
// a slice of ints, in input order.
//
// Whitespace around the whole string and around each field is ignored, and
// empty fields are skipped. A blank input, or one made only of empty fields,
// yields (nil, nil). A field that is not an integer, or is negative, yields a
// nil slice and an error naming the offending field.
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
	rest := strings.TrimSpace(raw)
	if rest == "" {
		return nil, nil
	}

	var parsed []int
	// Peel one field off the front per iteration; more turns false once the
	// last separator has been consumed, after which the final field is still
	// processed.
	for more := true; more; {
		var field string
		field, rest, more = strings.Cut(rest, ",")
		field = strings.TrimSpace(field)
		if field == "" {
			continue
		}
		n, err := strconv.Atoi(field)
		if err != nil || n < 0 {
			return nil, fmt.Errorf("bad gpu index %q", field)
		}
		parsed = append(parsed, n)
	}
	return parsed, nil
}
|