Files
bee/audit/cmd/bee/main.go

484 lines
13 KiB
Go

package main
import (
"context"
"flag"
"fmt"
"io"
"log/slog"
"os"
"runtime/debug"
"strconv"
"strings"
"bee/audit/internal/app"
"bee/audit/internal/platform"
"bee/audit/internal/runtimeenv"
"bee/audit/internal/webui"
)
var Version = "dev"
func buildLabel() string {
label := strings.TrimSpace(Version)
if label == "" {
return "dev"
}
return label
}
func main() {
os.Exit(run(os.Args[1:], os.Stdout, os.Stderr))
}
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: slog.LevelInfo,
})))
defer func() {
if rec := recover(); rec != nil {
slog.Error("fatal panic",
"panic", fmt.Sprint(rec),
"stack", string(debug.Stack()),
)
exitCode = 1
}
}()
if len(args) == 0 {
printRootUsage(stderr)
return 2
}
switch args[0] {
case "help", "--help", "-h":
if len(args) > 1 {
return runHelp(args[1:], stdout, stderr)
}
printRootUsage(stdout)
return 0
case "audit":
return runAudit(args[1:], stdout, stderr)
case "export":
return runExport(args[1:], stdout, stderr)
case "preflight":
return runPreflight(args[1:], stdout, stderr)
case "support-bundle":
return runSupportBundle(args[1:], stdout, stderr)
case "web":
return runWeb(args[1:], stdout, stderr)
case "sat":
return runSAT(args[1:], stdout, stderr)
case "benchmark":
return runBenchmark(args[1:], stdout, stderr)
case "version", "--version", "-version":
fmt.Fprintln(stdout, Version)
return 0
default:
fmt.Fprintf(stderr, "bee: unknown command %q\n\n", args[0])
printRootUsage(stderr)
return 2
}
}
func printRootUsage(w io.Writer) {
fmt.Fprintln(w, `bee commands:
bee audit --runtime auto|local|livecd --output stdout|file:<path>
bee preflight --output stdout|file:<path>
bee export --target <device>
bee support-bundle --output stdout|file:<path>
bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+`
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
bee benchmark nvidia [--profile standard|stability|overnight]
bee version
bee help [command]`)
}
func runHelp(args []string, stdout, stderr io.Writer) int {
switch args[0] {
case "audit":
return runAudit([]string{"--help"}, stdout, stdout)
case "export":
return runExport([]string{"--help"}, stdout, stdout)
case "preflight":
return runPreflight([]string{"--help"}, stdout, stdout)
case "support-bundle":
return runSupportBundle([]string{"--help"}, stdout, stdout)
case "web":
return runWeb([]string{"--help"}, stdout, stdout)
case "sat":
return runSAT([]string{"--help"}, stdout, stderr)
case "benchmark":
return runBenchmark([]string{"--help"}, stdout, stderr)
case "version":
fmt.Fprintln(stdout, "usage: bee version")
return 0
default:
fmt.Fprintf(stderr, "bee help: unknown command %q\n\n", args[0])
printRootUsage(stderr)
return 2
}
}
func runAudit(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("audit", flag.ContinueOnError)
fs.SetOutput(stderr)
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
runtimeFlag := fs.String("runtime", "auto", "runtime environment: auto, local, livecd")
showVersion := fs.Bool("version", false, "print version and exit")
fs.Usage = func() {
fmt.Fprintln(stderr, "usage: bee audit [--runtime auto|local|livecd] [--output stdout|file:<path>]")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
if *showVersion {
fmt.Fprintln(stdout, Version)
return 0
}
runtimeInfo, err := runtimeenv.Detect(*runtimeFlag)
if err != nil {
slog.Error("resolve runtime", "err", err)
return 1
}
slog.Info("runtime resolved", "mode", runtimeInfo.Mode, "reason", runtimeInfo.Reason)
application := app.New(platform.New())
path, err := application.RunAudit(runtimeInfo.Mode, *output)
if err != nil {
slog.Error("run audit", "err", err)
return 1
}
if path != "stdout" {
slog.Info("audit output written", "path", path)
}
return 0
}
func runExport(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("export", flag.ContinueOnError)
fs.SetOutput(stderr)
targetDevice := fs.String("target", "", "removable device path, e.g. /dev/sdb1")
fs.Usage = func() {
fmt.Fprintln(stderr, "usage: bee export --target <device>")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
if strings.TrimSpace(*targetDevice) == "" {
fmt.Fprintln(stderr, "bee export: --target is required")
fs.Usage()
return 2
}
application := app.New(platform.New())
targets, err := application.ListRemovableTargets()
if err != nil {
slog.Error("list removable targets", "err", err)
return 1
}
for _, target := range targets {
if target.Device == *targetDevice {
path, err := application.ExportLatestAudit(target)
if err != nil {
slog.Error("export latest audit", "err", err)
return 1
}
slog.Info("audit exported", "path", path)
return 0
}
}
slog.Error("target device not found among removable filesystems", "device", *targetDevice)
return 1
}
func runPreflight(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("preflight", flag.ContinueOnError)
fs.SetOutput(stderr)
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
fs.Usage = func() {
fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
application := app.New(platform.New())
path, err := application.RunRuntimePreflight(*output)
if err != nil {
slog.Error("run preflight", "err", err)
return 1
}
if path != "stdout" {
slog.Info("runtime health written", "path", path)
}
return 0
}
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
fs.SetOutput(stderr)
output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
fs.Usage = func() {
fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
path, err := app.BuildSupportBundle(app.DefaultExportDir)
if err != nil {
slog.Error("build support bundle", "err", err)
return 1
}
defer os.Remove(path)
raw, err := os.ReadFile(path)
if err != nil {
slog.Error("read support bundle", "err", err)
return 1
}
switch {
case *output == "stdout":
if _, err := stdout.Write(raw); err != nil {
slog.Error("write support bundle stdout", "err", err)
return 1
}
case strings.HasPrefix(*output, "file:"):
dst := strings.TrimPrefix(*output, "file:")
if err := os.WriteFile(dst, raw, 0644); err != nil {
slog.Error("write support bundle", "err", err)
return 1
}
slog.Info("support bundle written", "path", dst)
default:
fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
fs.Usage()
return 2
}
return 0
}
func runWeb(args []string, stdout, stderr io.Writer) int {
fs := flag.NewFlagSet("web", flag.ContinueOnError)
fs.SetOutput(stderr)
listenAddr := fs.String("listen", ":8080", "listen address, e.g. :80")
auditPath := fs.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
title := fs.String("title", "Bee Hardware Audit", "page title")
fs.Usage = func() {
fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
fs.PrintDefaults()
}
if err := fs.Parse(args); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fs.Usage()
return 2
}
slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)
runtimeInfo, err := runtimeenv.Detect("auto")
if err != nil {
slog.Warn("resolve runtime for web", "err", err)
}
if err := webui.ListenAndServe(*listenAddr, webui.HandlerOptions{
Title: *title,
BuildLabel: buildLabel(),
AuditPath: *auditPath,
ExportDir: *exportDir,
App: app.New(platform.New()),
RuntimeMode: runtimeInfo.Mode,
}); err != nil {
slog.Error("run web", "err", err)
return 1
}
return 0
}
func runSAT(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
return 2
}
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
fmt.Fprintln(stdout, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
return 0
}
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
fs.SetOutput(stderr)
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
if err := fs.Parse(args[1:]); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
return 2
}
target := args[0]
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
return 2
}
application := app.New(platform.New())
var (
archive string
err error
)
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
switch target {
case "nvidia":
level := *diagLevel
if level > 0 {
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
} else {
archive, err = application.RunNvidiaAcceptancePack("", logLine)
}
case "memory":
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
case "storage":
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
case "cpu":
dur := *duration
if dur <= 0 {
dur = 60
}
archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
}
if err != nil {
slog.Error("run sat", "target", target, "err", err)
return 1
}
slog.Info("sat archive written", "target", target, "path", archive)
return 0
}
func runBenchmark(args []string, stdout, stderr io.Writer) int {
if len(args) == 0 {
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 2
}
if args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 0
}
target := args[0]
if target != "nvidia" {
fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]")
return 2
}
fs := flag.NewFlagSet("benchmark", flag.ContinueOnError)
fs.SetOutput(stderr)
profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
devices := fs.String("devices", "", "comma-separated GPU indices to include")
exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude")
sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
if err := fs.Parse(args[1:]); err != nil {
if err == flag.ErrHelp {
return 0
}
return 2
}
if fs.NArg() != 0 {
fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
return 2
}
includeIndices, err := parseBenchmarkIndexCSV(*devices)
if err != nil {
fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
return 2
}
excludeIndices, err := parseBenchmarkIndexCSV(*exclude)
if err != nil {
fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
return 2
}
application := app.New(platform.New())
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{
Profile: *profile,
SizeMB: *sizeMB,
GPUIndices: includeIndices,
ExcludeGPUIndices: excludeIndices,
RunNCCL: !*skipNCCL,
}, logLine)
if err != nil {
slog.Error("run benchmark", "target", target, "err", err)
return 1
}
slog.Info("benchmark archive written", "target", target, "path", archive)
return 0
}
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
raw = strings.TrimSpace(raw)
if raw == "" {
return nil, nil
}
var indices []int
for _, part := range strings.Split(raw, ",") {
part = strings.TrimSpace(part)
if part == "" {
continue
}
value, err := strconv.Atoi(part)
if err != nil || value < 0 {
return nil, fmt.Errorf("bad gpu index %q", part)
}
indices = append(indices, value)
}
return indices, nil
}