// bee/audit/cmd/bee/main.go

package main

import (
	"context"
	"flag"
	"fmt"
	"io"
	"log/slog"
	"os"
	"runtime/debug"
	"strconv"
	"strings"

	"bee/audit/internal/app"
	"bee/audit/internal/platform"
	"bee/audit/internal/runtimeenv"
	"bee/audit/internal/webui"
)

// Version is the build version string printed by the version commands and
// flags in this file and, once trimmed, used as the web UI build label.
// It defaults to "dev".
// NOTE(review): presumably overridden at link time (for example with
// -ldflags "-X main.Version=..."); confirm against the build scripts.
var Version = "dev"

// buildLabel returns Version with surrounding whitespace removed, falling
// back to "dev" when nothing is left after trimming.
func buildLabel() string {
	if trimmed := strings.TrimSpace(Version); trimmed != "" {
		return trimmed
	}
	return "dev"
}

// main hands the command-line arguments (without the program name) and the
// process's standard streams to run, then exits with the code run returns.
func main() {
	code := run(os.Args[1:], os.Stdout, os.Stderr)
	os.Exit(code)
}

// run dispatches the first element of args to the matching subcommand and
// returns the process exit code.
//
// It installs an Info-level text slog handler on os.Stderr as the default
// logger before doing anything else. A panic raised while handling the
// command is recovered, logged together with its stack trace, and reported
// as exit code 1. An empty argument list or an unknown command prints the
// root usage to stderr and yields exit code 2.
func run(args []string, stdout, stderr io.Writer) (exitCode int) {
	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})
	slog.SetDefault(slog.New(handler))

	// exitCode is a named result so the recovery path can overwrite it.
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		slog.Error("fatal panic",
			"panic", fmt.Sprint(rec),
			"stack", string(debug.Stack()),
		)
		exitCode = 1
	}()

	if len(args) == 0 {
		printRootUsage(stderr)
		return 2
	}

	command, rest := args[0], args[1:]
	switch command {
	case "audit":
		return runAudit(rest, stdout, stderr)
	case "export":
		return runExport(rest, stdout, stderr)
	case "preflight":
		return runPreflight(rest, stdout, stderr)
	case "support-bundle":
		return runSupportBundle(rest, stdout, stderr)
	case "web":
		return runWeb(rest, stdout, stderr)
	case "sat":
		return runSAT(rest, stdout, stderr)
	case "benchmark":
		return runBenchmark(rest, stdout, stderr)
	case "version", "--version", "-version":
		fmt.Fprintln(stdout, Version)
		return 0
	case "help", "--help", "-h":
		// "help <command>" shows that command's help; bare help shows the root usage.
		if len(rest) == 0 {
			printRootUsage(stdout)
			return 0
		}
		return runHelp(rest, stdout, stderr)
	}

	// Every known command returned above; anything reaching here is unknown.
	fmt.Fprintf(stderr, "bee: unknown command %q\n\n", command)
	printRootUsage(stderr)
	return 2
}

// printRootUsage writes the top-level command summary to w in a single
// write, followed by a newline. The "bee web" line embeds
// app.DefaultAuditJSONPath as the example audit path.
func printRootUsage(w io.Writer) {
	usage := "bee commands:\n" +
		"  bee audit   --runtime auto|local|livecd --output stdout|file:<path>\n" +
		"  bee preflight --output stdout|file:<path>\n" +
		"  bee export  --target <device>\n" +
		"  bee support-bundle --output stdout|file:<path>\n" +
		"  bee web     --listen :80 --audit-path " + app.DefaultAuditJSONPath + "\n" +
		"  bee sat nvidia|memory|storage|cpu [--duration <seconds>]\n" +
		"  bee benchmark nvidia [--profile standard|stability|overnight]\n" +
		"  bee version\n" +
		"  bee help [command]"
	fmt.Fprintln(w, usage)
}

func runHelp(args []string, stdout, stderr io.Writer) int {
	switch args[0] {
	case "audit":
		return runAudit([]string{"--help"}, stdout, stdout)
	case "export":
		return runExport([]string{"--help"}, stdout, stdout)
	case "preflight":
		return runPreflight([]string{"--help"}, stdout, stdout)
	case "support-bundle":
		return runSupportBundle([]string{"--help"}, stdout, stdout)
	case "web":
		return runWeb([]string{"--help"}, stdout, stdout)
	case "sat":
		return runSAT([]string{"--help"}, stdout, stderr)
	case "benchmark":
		return runBenchmark([]string{"--help"}, stdout, stderr)
	case "version":
		fmt.Fprintln(stdout, "usage: bee version")
		return 0
	default:
		fmt.Fprintf(stderr, "bee help: unknown command %q\n\n", args[0])
		printRootUsage(stderr)
		return 2
	}
}

// runAudit implements "bee audit". It parses --output, --runtime and
// --version, resolves the runtime environment with runtimeenv.Detect, and
// runs the audit through app.RunAudit.
//
// Exit codes: 0 on success, on a help request, or after printing the
// version; 2 on flag errors or unexpected positional arguments; 1 when
// runtime detection or the audit itself fails.
func runAudit(args []string, stdout, stderr io.Writer) int {
	flags := flag.NewFlagSet("audit", flag.ContinueOnError)
	flags.SetOutput(stderr)
	var (
		dest        = flags.String("output", "stdout", "output destination: stdout or file:<path>")
		runtimeMode = flags.String("runtime", "auto", "runtime environment: auto, local, livecd")
		wantVersion = flags.Bool("version", false, "print version and exit")
	)
	flags.Usage = func() {
		fmt.Fprintln(stderr, "usage: bee audit [--runtime auto|local|livecd] [--output stdout|file:<path>]")
		flags.PrintDefaults()
	}

	switch err := flags.Parse(args); err {
	case nil:
		// parsed cleanly; continue below
	case flag.ErrHelp:
		return 0
	default:
		return 2
	}
	if flags.NArg() > 0 {
		flags.Usage()
		return 2
	}
	if *wantVersion {
		fmt.Fprintln(stdout, Version)
		return 0
	}

	env, err := runtimeenv.Detect(*runtimeMode)
	if err != nil {
		slog.Error("resolve runtime", "err", err)
		return 1
	}
	slog.Info("runtime resolved", "mode", env.Mode, "reason", env.Reason)

	auditor := app.New(platform.New())
	written, err := auditor.RunAudit(env.Mode, *dest)
	if err != nil {
		slog.Error("run audit", "err", err)
		return 1
	}
	// Only announce a location when the result did not go to stdout.
	if written != "stdout" {
		slog.Info("audit output written", "path", written)
	}
	return 0
}

// runExport implements "bee export". It requires --target, looks that
// device up among the targets returned by app.ListRemovableTargets, and
// exports the latest audit to the first exact match.
//
// Exit codes: 0 on success or a help request; 2 on flag errors, unexpected
// positional arguments, or a missing/blank --target; 1 when listing targets
// fails, the export fails, or no listed target has the requested device.
func runExport(args []string, stdout, stderr io.Writer) int {
	flags := flag.NewFlagSet("export", flag.ContinueOnError)
	flags.SetOutput(stderr)
	device := flags.String("target", "", "removable device path, e.g. /dev/sdb1")
	flags.Usage = func() {
		fmt.Fprintln(stderr, "usage: bee export --target <device>")
		flags.PrintDefaults()
	}

	switch err := flags.Parse(args); err {
	case nil:
		// parsed cleanly; continue below
	case flag.ErrHelp:
		return 0
	default:
		return 2
	}
	if flags.NArg() > 0 {
		flags.Usage()
		return 2
	}
	if len(strings.TrimSpace(*device)) == 0 {
		fmt.Fprintln(stderr, "bee export: --target is required")
		flags.Usage()
		return 2
	}

	exporter := app.New(platform.New())
	candidates, err := exporter.ListRemovableTargets()
	if err != nil {
		slog.Error("list removable targets", "err", err)
		return 1
	}

	for _, candidate := range candidates {
		// The flag value is compared as given (untrimmed) against each device.
		if candidate.Device != *device {
			continue
		}
		exported, err := exporter.ExportLatestAudit(candidate)
		if err != nil {
			slog.Error("export latest audit", "err", err)
			return 1
		}
		slog.Info("audit exported", "path", exported)
		return 0
	}

	slog.Error("target device not found among removable filesystems", "device", *device)
	return 1
}

// runPreflight implements "bee preflight". It parses --output and runs
// app.RunRuntimePreflight with that destination.
//
// Exit codes: 0 on success or a help request; 2 on flag errors or
// unexpected positional arguments; 1 when the preflight run fails.
func runPreflight(args []string, stdout, stderr io.Writer) int {
	flags := flag.NewFlagSet("preflight", flag.ContinueOnError)
	flags.SetOutput(stderr)
	dest := flags.String("output", "stdout", "output destination: stdout or file:<path>")
	flags.Usage = func() {
		fmt.Fprintf(stderr, "usage: bee preflight [--output stdout|file:%s]\n", app.DefaultRuntimeJSONPath)
		flags.PrintDefaults()
	}

	switch err := flags.Parse(args); err {
	case nil:
		// parsed cleanly; continue below
	case flag.ErrHelp:
		return 0
	default:
		return 2
	}
	if flags.NArg() > 0 {
		flags.Usage()
		return 2
	}

	checker := app.New(platform.New())
	written, err := checker.RunRuntimePreflight(*dest)
	if err != nil {
		slog.Error("run preflight", "err", err)
		return 1
	}
	// Only announce a location when the result did not go to stdout.
	if written != "stdout" {
		slog.Info("runtime health written", "path", written)
	}
	return 0
}

// runSupportBundle implements "bee support-bundle". It builds a bundle from
// app.DefaultExportDir with app.BuildSupportBundle and copies the resulting
// file either to stdout or to the path given as --output file:<path>.
//
// The --output value is validated before the bundle is built, so an invalid
// destination is reported as a usage error without doing any bundling work.
//
// Exit codes: 0 on success or a help request; 2 on flag errors, unexpected
// positional arguments, or an invalid --output; 1 when building, reading,
// or writing the bundle fails.
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
	fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
	fs.SetOutput(stderr)
	output := fs.String("output", "stdout", "output destination: stdout or file:<path>")
	fs.Usage = func() {
		fmt.Fprintln(stderr, "usage: bee support-bundle [--output stdout|file:<path>]")
		fs.PrintDefaults()
	}
	if err := fs.Parse(args); err != nil {
		if err == flag.ErrHelp {
			return 0
		}
		return 2
	}
	if fs.NArg() != 0 {
		fs.Usage()
		return 2
	}

	// Resolve the destination up front: dst stays empty for stdout and holds
	// the target path for file:<path>.
	var dst string
	switch {
	case *output == "stdout":
		// nothing to resolve
	case strings.HasPrefix(*output, "file:"):
		dst = strings.TrimPrefix(*output, "file:")
		if dst == "" {
			fmt.Fprintln(stderr, "bee support-bundle: file: output requires a path")
			fs.Usage()
			return 2
		}
	default:
		fmt.Fprintln(stderr, "bee support-bundle: unknown output destination")
		fs.Usage()
		return 2
	}

	path, err := app.BuildSupportBundle(app.DefaultExportDir)
	if err != nil {
		slog.Error("build support bundle", "err", err)
		return 1
	}
	// The built file is only an intermediate here: its content is copied to
	// the requested destination, so it is removed on return (best effort).
	defer os.Remove(path)

	raw, err := os.ReadFile(path)
	if err != nil {
		slog.Error("read support bundle", "err", err)
		return 1
	}

	if dst == "" {
		if _, err := stdout.Write(raw); err != nil {
			slog.Error("write support bundle stdout", "err", err)
			return 1
		}
		return 0
	}

	if err := os.WriteFile(dst, raw, 0644); err != nil {
		slog.Error("write support bundle", "err", err)
		return 1
	}
	slog.Info("support bundle written", "path", dst)
	return 0
}

// runWeb implements "bee web". It parses --listen, --audit-path,
// --export-dir and --title, detects the runtime in "auto" mode, and serves
// the web UI through webui.ListenAndServe.
//
// A runtime detection failure is only logged as a warning; the server is
// still started with whatever mode value Detect returned.
//
// Exit codes: 0 when the server returns without error or on a help request;
// 2 on flag errors or unexpected positional arguments; 1 when
// ListenAndServe returns an error.
func runWeb(args []string, stdout, stderr io.Writer) int {
	flags := flag.NewFlagSet("web", flag.ContinueOnError)
	flags.SetOutput(stderr)
	var (
		listenAddr = flags.String("listen", ":8080", "listen address, e.g. :80")
		auditPath  = flags.String("audit-path", app.DefaultAuditJSONPath, "path to the latest audit JSON snapshot")
		exportDir  = flags.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
		pageTitle  = flags.String("title", "Bee Hardware Audit", "page title")
	)
	flags.Usage = func() {
		fmt.Fprintf(stderr, "usage: bee web [--listen :80] [--audit-path %s] [--export-dir %s] [--title \"Bee Hardware Audit\"]\n", app.DefaultAuditJSONPath, app.DefaultExportDir)
		flags.PrintDefaults()
	}

	switch err := flags.Parse(args); err {
	case nil:
		// parsed cleanly; continue below
	case flag.ErrHelp:
		return 0
	default:
		return 2
	}
	if flags.NArg() > 0 {
		flags.Usage()
		return 2
	}

	slog.Info("starting bee web", "listen", *listenAddr, "audit_path", *auditPath)

	env, detectErr := runtimeenv.Detect("auto")
	if detectErr != nil {
		// Deliberately non-fatal: the UI is served even without a resolved runtime.
		slog.Warn("resolve runtime for web", "err", detectErr)
	}

	opts := webui.HandlerOptions{
		Title:       *pageTitle,
		BuildLabel:  buildLabel(),
		AuditPath:   *auditPath,
		ExportDir:   *exportDir,
		App:         app.New(platform.New()),
		RuntimeMode: env.Mode,
	}
	serveErr := webui.ListenAndServe(*listenAddr, opts)
	if serveErr == nil {
		return 0
	}
	slog.Error("run web", "err", serveErr)
	return 1
}

// runSAT implements "bee sat <target>", where target is one of nvidia,
// memory, storage or cpu. args[0] is the target (or a help token); the
// remaining arguments are parsed as flags.
//
// Flags:
//   - --duration: seconds passed to the cpu run; values <= 0 fall back to 60.
//     It is not used by the other targets.
//   - --diag-level: when > 0, the nvidia run goes through
//     RunNvidiaAcceptancePackWithOptions with that level; otherwise the
//     plain RunNvidiaAcceptancePack is used.
//
// Progress lines from the run are written to os.Stderr.
//
// Exit codes: 0 on success or a help request; 2 for a missing or unknown
// target, flag errors, or unexpected positional arguments; 1 when the run
// fails.
func runSAT(args []string, stdout, stderr io.Writer) int {
	// Single usage line shared by every path so help always lists all flags.
	const usage = "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]"

	if len(args) == 0 {
		fmt.Fprintln(stderr, usage)
		return 2
	}

	// Validate the target before parsing flags so a mistyped target is
	// reported as such rather than as a flag error.
	target := args[0]
	switch target {
	case "help", "--help", "-h":
		fmt.Fprintln(stdout, usage)
		return 0
	case "nvidia", "memory", "storage", "cpu":
		// known target; continue below
	default:
		fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
		fmt.Fprintln(stderr, usage)
		return 2
	}

	fs := flag.NewFlagSet("sat", flag.ContinueOnError)
	fs.SetOutput(stderr)
	duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
	diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
	fs.Usage = func() {
		fmt.Fprintln(stderr, usage)
		fs.PrintDefaults()
	}
	if err := fs.Parse(args[1:]); err != nil {
		if err == flag.ErrHelp {
			return 0
		}
		return 2
	}
	if fs.NArg() != 0 {
		fmt.Fprintf(stderr, "bee sat: unexpected arguments\n")
		return 2
	}

	application := app.New(platform.New())
	var (
		archive string
		err     error
	)
	logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
	switch target {
	case "nvidia":
		if level := *diagLevel; level > 0 {
			// Keep the returned archive path so the success log below
			// reports it instead of an empty string.
			archive, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
		} else {
			archive, err = application.RunNvidiaAcceptancePack("", logLine)
		}
	case "memory":
		archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
	case "storage":
		archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
	case "cpu":
		dur := *duration
		if dur <= 0 {
			dur = 60
		}
		archive, err = application.RunCPUAcceptancePackCtx(context.Background(), "", dur, logLine)
	}
	if err != nil {
		slog.Error("run sat", "target", target, "err", err)
		return 1
	}
	slog.Info("sat archive written", "target", target, "path", archive)
	return 0
}

// runBenchmark implements "bee benchmark nvidia". args[0] is the target (or
// a help token); the remaining arguments are parsed as flags and turned into
// platform.NvidiaBenchmarkOptions for app.RunNvidiaBenchmark.
//
// --devices and --exclude are comma-separated GPU index lists parsed by
// parseBenchmarkIndexCSV. Progress lines are written to os.Stderr.
//
// Exit codes: 0 on success or a help request; 2 for a missing or unknown
// target, flag errors, unexpected positional arguments, or a malformed
// index list; 1 when the benchmark run fails.
func runBenchmark(args []string, stdout, stderr io.Writer) int {
	const usage = "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]"

	if len(args) == 0 {
		fmt.Fprintln(stderr, usage)
		return 2
	}

	target := args[0]
	switch target {
	case "help", "--help", "-h":
		fmt.Fprintln(stdout, usage)
		return 0
	case "nvidia":
		// the only supported target; continue below
	default:
		fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target)
		fmt.Fprintln(stderr, usage)
		return 2
	}

	flags := flag.NewFlagSet("benchmark", flag.ContinueOnError)
	flags.SetOutput(stderr)
	var (
		profile    = flags.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight")
		includeCSV = flags.String("devices", "", "comma-separated GPU indices to include")
		excludeCSV = flags.String("exclude", "", "comma-separated GPU indices to exclude")
		bufferMB   = flags.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)")
		noNCCL     = flags.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark")
	)
	switch err := flags.Parse(args[1:]); err {
	case nil:
		// parsed cleanly; continue below
	case flag.ErrHelp:
		return 0
	default:
		return 2
	}
	if flags.NArg() > 0 {
		fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n")
		return 2
	}

	included, err := parseBenchmarkIndexCSV(*includeCSV)
	if err != nil {
		fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err)
		return 2
	}
	excluded, err := parseBenchmarkIndexCSV(*excludeCSV)
	if err != nil {
		fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err)
		return 2
	}

	opts := platform.NvidiaBenchmarkOptions{
		Profile:           *profile,
		SizeMB:            *bufferMB,
		GPUIndices:        included,
		ExcludeGPUIndices: excluded,
		RunNCCL:           !*noNCCL,
	}
	runner := app.New(platform.New())
	progress := func(line string) { fmt.Fprintln(os.Stderr, line) }

	archive, err := runner.RunNvidiaBenchmark("", opts, progress)
	if err != nil {
		slog.Error("run benchmark", "target", target, "err", err)
		return 1
	}
	slog.Info("benchmark archive written", "target", target, "path", archive)
	return 0
}

// parseBenchmarkIndexCSV parses a comma-separated list of non-negative GPU
// indices such as "0, 2,3".
//
// Whitespace around the whole input and around each entry is ignored, and
// empty entries (for example from a doubled or trailing comma) are skipped.
// Input with no entries at all yields a nil slice and a nil error. Entries
// are returned in input order; duplicates are kept. The first entry that is
// not an integer, or is negative, produces an error naming that entry.
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
	if strings.TrimSpace(raw) == "" {
		return nil, nil
	}

	// FieldsFunc already drops zero-length pieces between commas; pieces made
	// only of whitespace are filtered out after trimming below.
	fields := strings.FieldsFunc(raw, func(r rune) bool { return r == ',' })

	var out []int
	for _, field := range fields {
		token := strings.TrimSpace(field)
		if token == "" {
			continue
		}
		n, convErr := strconv.Atoi(token)
		switch {
		case convErr != nil, n < 0:
			return nil, fmt.Errorf("bad gpu index %q", token)
		}
		out = append(out, n)
	}
	return out, nil
}