diff --git a/audit/cmd/bee/main.go b/audit/cmd/bee/main.go index 5fb6c1e..9bf7827 100644 --- a/audit/cmd/bee/main.go +++ b/audit/cmd/bee/main.go @@ -8,6 +8,7 @@ import ( "log/slog" "os" "runtime/debug" + "strconv" "strings" "bee/audit/internal/app" @@ -35,15 +36,13 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) { Level: slog.LevelInfo, }))) defer func() { - rec := recover() - if rec == nil { - return + if rec := recover(); rec != nil { + slog.Error("fatal panic", + "panic", fmt.Sprint(rec), + "stack", string(debug.Stack()), + ) + exitCode = 1 } - slog.Error("fatal panic", - "panic", fmt.Sprint(rec), - "stack", string(debug.Stack()), - ) - exitCode = 1 }() if len(args) == 0 { @@ -70,6 +69,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) { return runWeb(args[1:], stdout, stderr) case "sat": return runSAT(args[1:], stdout, stderr) + case "benchmark": + return runBenchmark(args[1:], stdout, stderr) case "version", "--version", "-version": fmt.Fprintln(stdout, Version) return 0 @@ -88,6 +89,7 @@ func printRootUsage(w io.Writer) { bee support-bundle --output stdout|file: bee web --listen :80 --audit-path `+app.DefaultAuditJSONPath+` bee sat nvidia|memory|storage|cpu [--duration ] + bee benchmark nvidia [--profile standard|stability|overnight] bee version bee help [command]`) } @@ -106,6 +108,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int { return runWeb([]string{"--help"}, stdout, stdout) case "sat": return runSAT([]string{"--help"}, stdout, stderr) + case "benchmark": + return runBenchmark([]string{"--help"}, stdout, stderr) case "version": fmt.Fprintln(stdout, "usage: bee version") return 0 @@ -395,3 +399,85 @@ func runSAT(args []string, stdout, stderr io.Writer) int { slog.Info("sat archive written", "target", target, "path", archive) return 0 } + +func runBenchmark(args []string, stdout, stderr io.Writer) int { + if len(args) == 0 { + fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile 
standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 2 + } + if args[0] == "help" || args[0] == "--help" || args[0] == "-h" { + fmt.Fprintln(stdout, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 0 + } + target := args[0] + if target != "nvidia" { + fmt.Fprintf(stderr, "bee benchmark: unknown target %q\n", target) + fmt.Fprintln(stderr, "usage: bee benchmark nvidia [--profile standard|stability|overnight] [--devices 0,1] [--exclude 2,3] [--size-mb N] [--skip-nccl]") + return 2 + } + + fs := flag.NewFlagSet("benchmark", flag.ContinueOnError) + fs.SetOutput(stderr) + profile := fs.String("profile", platform.NvidiaBenchmarkProfileStandard, "benchmark profile: standard, stability, overnight") + devices := fs.String("devices", "", "comma-separated GPU indices to include") + exclude := fs.String("exclude", "", "comma-separated GPU indices to exclude") + sizeMB := fs.Int("size-mb", 0, "per-GPU benchmark buffer size in MB (0 = auto)") + skipNCCL := fs.Bool("skip-nccl", false, "skip multi-GPU NCCL interconnect benchmark") + if err := fs.Parse(args[1:]); err != nil { + if err == flag.ErrHelp { + return 0 + } + return 2 + } + if fs.NArg() != 0 { + fmt.Fprintf(stderr, "bee benchmark: unexpected arguments\n") + return 2 + } + + includeIndices, err := parseBenchmarkIndexCSV(*devices) + if err != nil { + fmt.Fprintf(stderr, "bee benchmark: invalid --devices: %v\n", err) + return 2 + } + excludeIndices, err := parseBenchmarkIndexCSV(*exclude) + if err != nil { + fmt.Fprintf(stderr, "bee benchmark: invalid --exclude: %v\n", err) + return 2 + } + + application := app.New(platform.New()) + logLine := func(s string) { fmt.Fprintln(stderr, s) } // use the injected stderr writer, matching the other run* commands + archive, err := application.RunNvidiaBenchmark("", platform.NvidiaBenchmarkOptions{ + Profile: *profile, + SizeMB: *sizeMB, + GPUIndices: includeIndices, + ExcludeGPUIndices: excludeIndices, + 
RunNCCL: !*skipNCCL, + }, logLine) + if err != nil { + slog.Error("run benchmark", "target", target, "err", err) + return 1 + } + slog.Info("benchmark archive written", "target", target, "path", archive) + return 0 +} + +func parseBenchmarkIndexCSV(raw string) ([]int, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, nil + } + var indices []int + for _, part := range strings.Split(raw, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + value, err := strconv.Atoi(part) + if err != nil || value < 0 { + return nil, fmt.Errorf("bad gpu index %q", part) + } + indices = append(indices, value) + } + return indices, nil +} diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 1a18863..96fd5f7 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -19,17 +19,18 @@ import ( ) var ( - DefaultExportDir = "/appdata/bee/export" - DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" - DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" - DefaultWebLogPath = DefaultExportDir + "/bee-web.log" - DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" - DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" - DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" - DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" - DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" - DefaultTechDumpDir = DefaultExportDir + "/techdump" - DefaultSATBaseDir = DefaultExportDir + "/bee-sat" + DefaultExportDir = "/appdata/bee/export" + DefaultAuditJSONPath = DefaultExportDir + "/bee-audit.json" + DefaultAuditLogPath = DefaultExportDir + "/bee-audit.log" + DefaultWebLogPath = DefaultExportDir + "/bee-web.log" + DefaultNetworkLogPath = DefaultExportDir + "/bee-network.log" + DefaultNvidiaLogPath = DefaultExportDir + "/bee-nvidia.log" + DefaultSSHLogPath = DefaultExportDir + "/bee-sshsetup.log" + DefaultRuntimeJSONPath = DefaultExportDir + "/runtime-health.json" + 
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" + DefaultTechDumpDir = DefaultExportDir + "/techdump" + DefaultSATBaseDir = DefaultExportDir + "/bee-sat" + DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark" ) type App struct { @@ -114,6 +115,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error { type satRunner interface { RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) + RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) @@ -195,11 +197,10 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - err := atomicWriteFile(path, append(data, '\n'), 0644) - if err == nil { - return path, nil + if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { + return "", err } - return "", err + return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } @@ -221,11 +222,10 @@ func (a *App) RunRuntimePreflight(output string) (string, error) { return "stdout", err case strings.HasPrefix(output, "file:"): path := strings.TrimPrefix(output, "file:") - err := atomicWriteFile(path, append(data, '\n'), 0644) - if err == nil { - return path, nil + if err := atomicWriteFile(path, append(data, '\n'), 0644); err != nil { + return "", err } - return "", err + 
return path, nil default: return "", fmt.Errorf("unknown output destination %q — use stdout or file:", output) } @@ -532,6 +532,17 @@ func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOpti return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) } +func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBenchmarkBaseDir + } + return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) +} + func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 9fa9bc3..6ac733f 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -120,15 +120,16 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus { } type fakeSAT struct { - runNvidiaFn func(string) (string, error) - runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) - runMemoryFn func(string) (string, error) - runStorageFn func(string) (string, error) - runCPUFn func(string, int) (string, error) - detectVendorFn func() string - listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) - runAMDPackFn func(string) (string, error) - listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) + runNvidiaFn func(string) (string, error) + runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) + runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) + runMemoryFn func(string) 
(string, error) + runStorageFn func(string) (string, error) + runCPUFn func(string, int) (string, error) + detectVendorFn func() string + listAMDGPUsFn func() ([]platform.AMDGPUInfo, error) + runAMDPackFn func(string) (string, error) + listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error) } func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) { @@ -139,6 +140,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s return f.runNvidiaFn(baseDir) } +func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) { + if f.runNvidiaBenchmarkFn != nil { + return f.runNvidiaBenchmarkFn(baseDir, opts) + } + return f.runNvidiaFn(baseDir) +} + func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) { if f.runNvidiaStressFn != nil { return f.runNvidiaStressFn(baseDir, opts) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go new file mode 100644 index 0000000..1b401ee --- /dev/null +++ b/audit/internal/platform/benchmark.go @@ -0,0 +1,1009 @@ +package platform + +import ( + "context" + "encoding/csv" + "encoding/json" + "fmt" + "math" + "os" + "path/filepath" + "regexp" + "sort" + "strconv" + "strings" + "time" +) + +const benchmarkVersion = "1" + +type benchmarkProfileSpec struct { + Name string + BaselineSec int + WarmupSec int + SteadySec int + NCCLSec int + CooldownSec int +} + +type benchmarkGPUInfo struct { + Index int + UUID string + Name string + BusID string + VBIOS string + PowerLimitW float64 + MaxGraphicsClockMHz float64 + MaxMemoryClockMHz float64 +} + +type benchmarkBurnProfile struct { + name string + category string + supported bool + lanes int + m uint64 + n uint64 + k uint64 + iterations uint64 + notes string +} + +type benchmarkBurnParseResult struct { + Device string + ComputeCapability string + 
Backend string + DurationSec int + Profiles []BenchmarkPrecisionResult + Fallback bool +} + +type benchmarkRestoreAction struct { + name string + fn func() +} + +var ( + benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`) + benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`) + benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`) +) + +func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if ctx == nil { + ctx = context.Background() + } + if logFunc == nil { + logFunc = func(string) {} + } + if strings.TrimSpace(baseDir) == "" { + baseDir = "/var/log/bee-benchmark" + } + spec := resolveBenchmarkProfile(opts.Profile) + opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) + + selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) + if err != nil { + return "", err + } + if len(selected) == 0 { + return "", fmt.Errorf("no NVIDIA GPUs selected") + } + + ts := time.Now().UTC().Format("20060102-150405") + runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) + if err := os.MkdirAll(runDir, 0755); err != nil { + return "", fmt.Errorf("mkdir %s: %w", runDir, err) + } + verboseLog := filepath.Join(runDir, "verbose.log") + + hostname, _ := os.Hostname() + result := NvidiaBenchmarkResult{ + BenchmarkVersion: benchmarkVersion, + GeneratedAt: time.Now().UTC(), + Hostname: hostname, + BenchmarkProfile: spec.Name, + SelectedGPUIndices: append([]int(nil), selected...), + Normalization: BenchmarkNormalization{ + Status: "full", + }, + } + + logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) + + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) + if infoErr != nil { + result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) + result.Normalization.Status = "partial" + } 
+ + if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { + _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) + } + + activeApps, err := queryActiveComputeApps(selected) + if err == nil && len(activeApps) > 0 { + result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark") + result.Normalization.Notes = append(result.Normalization.Notes, activeApps...) + result.Normalization.Status = "partial" + } + + restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result) + defer func() { + for i := len(restoreActions) - 1; i >= 0; i-- { + restoreActions[i].fn() + } + }() + + for _, idx := range selected { + gpuResult := BenchmarkGPUResult{ + Index: idx, + Status: "FAILED", + } + if info, ok := infoByIndex[idx]; ok { + gpuResult.UUID = info.UUID + gpuResult.Name = info.Name + gpuResult.BusID = info.BusID + gpuResult.VBIOS = info.VBIOS + gpuResult.PowerLimitW = info.PowerLimitW + gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz + gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz + } + if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { + gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz + gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz + } + + baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) + } + gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows) + + warmupCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.WarmupSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + } + logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) + warmupOut, _, 
warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc) + _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644) + if warmupErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) + continue + } + + beforeThrottle, _ := queryThrottleCounters(idx) + steadyCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.SteadySec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + } + logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec)) + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) + _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644) + afterThrottle, _ := queryThrottleCounters(idx) + if steadyErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) + } + + parseResult := parseBenchmarkBurnLog(string(steadyOut)) + gpuResult.ComputeCapability = parseResult.ComputeCapability + gpuResult.Backend = parseResult.Backend + gpuResult.PrecisionResults = parseResult.Profiles + if parseResult.Fallback { + gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") + } + + gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows) + gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle) + + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) + } + gpuResult.Cooldown = 
summarizeBenchmarkTelemetry(cooldownRows) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows) + + gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) + gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) + if steadyErr != nil { + gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) + } else if parseResult.Fallback { + gpuResult.Status = "PARTIAL" + } else { + gpuResult.Status = "OK" + } + + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) + } + + if len(selected) > 1 && opts.RunNCCL { + result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) + if result.Interconnect != nil && result.Interconnect.Supported { + for i := range result.GPUs { + result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps + result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores) + } + } + } + + result.Findings = buildBenchmarkFindings(result) + result.OverallStatus = benchmarkOverallStatus(result) + + resultJSON, err := json.MarshalIndent(result, "", " ") + if err != nil { + return "", fmt.Errorf("marshal benchmark result: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { + return "", fmt.Errorf("write result.json: %w", err) + } + + report := renderBenchmarkReport(result) + if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil { + return "", fmt.Errorf("write report.txt: %w", err) + } + + summary := renderBenchmarkSummary(result) + if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil { + return "", fmt.Errorf("write summary.txt: %w", err) + } + + archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz") + if err := createTarGz(archive, runDir); err != nil { + return "", fmt.Errorf("pack benchmark archive: %w", err) + } + return 
archive, nil +} + +func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions { + switch strings.TrimSpace(strings.ToLower(opts.Profile)) { + case NvidiaBenchmarkProfileStability: + opts.Profile = NvidiaBenchmarkProfileStability + case NvidiaBenchmarkProfileOvernight: + opts.Profile = NvidiaBenchmarkProfileOvernight + default: + opts.Profile = NvidiaBenchmarkProfileStandard + } + if opts.SizeMB < 0 { + opts.SizeMB = 0 + } + opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices) + opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices) + // NOTE(review): RunNCCL is an explicit caller decision (set from --skip-nccl). + // Forcing RunNCCL=true here made --skip-nccl a silent no-op, so the + // caller's value is preserved unchanged. + return opts +} + +func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { + switch strings.TrimSpace(strings.ToLower(profile)) { + case NvidiaBenchmarkProfileStability: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300} + case NvidiaBenchmarkProfileOvernight: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300} + default: + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120} + } +} + +func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { + args := []string{ + "--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory", + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
+ } + out, err := satExecCommand("nvidia-smi", args...).Output() + if err != nil { + return nil, fmt.Errorf("nvidia-smi gpu info: %w", err) + } + + r := csv.NewReader(strings.NewReader(string(out))) + r.TrimLeadingSpace = true + r.FieldsPerRecord = -1 + rows, err := r.ReadAll() + if err != nil { + return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err) + } + + infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) + for _, row := range rows { + if len(row) < 8 { + continue + } + idx, err := strconv.Atoi(strings.TrimSpace(row[0])) + if err != nil { + continue + } + infoByIndex[idx] = benchmarkGPUInfo{ + Index: idx, + UUID: strings.TrimSpace(row[1]), + Name: strings.TrimSpace(row[2]), + BusID: strings.TrimSpace(row[3]), + VBIOS: strings.TrimSpace(row[4]), + PowerLimitW: parseBenchmarkFloat(row[5]), + MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), + MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), + } + } + return infoByIndex, nil +} + +func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { + if os.Geteuid() != 0 { + result.Normalization.Status = "partial" + result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks") + for _, idx := range gpuIndices { + result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{ + Index: idx, + Notes: []string{"normalization skipped: root privileges are required"}, + }) + } + return nil + } + + var restore []benchmarkRestoreAction + for _, idx := range gpuIndices { + rec := BenchmarkNormalizationGPU{Index: idx} + if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil { + rec.PersistenceMode = "failed" + rec.Notes = append(rec.Notes, "failed to enable 
persistence mode") + result.Normalization.Status = "partial" + } else { + rec.PersistenceMode = "applied" + } + + if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 { + target := int(math.Round(info.MaxGraphicsClockMHz)) + if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil { + rec.GPUClockLockStatus = "failed" + rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out))) + result.Normalization.Status = "partial" + } else { + rec.GPUClockLockStatus = "applied" + rec.GPUClockLockMHz = float64(target) + idxCopy := idx + restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() { + _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil) + }}) + } + } + + if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 { + target := int(math.Round(info.MaxMemoryClockMHz)) + out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil) + switch { + case err == nil: + rec.MemoryClockLockStatus = "applied" + rec.MemoryClockLockMHz = float64(target) + idxCopy := idx + restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() { + _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil) + }}) + case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"): + rec.MemoryClockLockStatus = "unsupported" + rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path") + 
result.Normalization.Status = "partial" + default: + rec.MemoryClockLockStatus = "failed" + rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out))) + result.Normalization.Status = "partial" + } + } + + result.Normalization.GPUs = append(result.Normalization.GPUs, rec) + } + return restore +} + +func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) { + if durationSec <= 0 { + return nil, nil + } + deadline := time.Now().Add(time.Duration(durationSec) * time.Second) + var rows []GPUMetricRow + start := time.Now() + for { + if ctx.Err() != nil { + return rows, ctx.Err() + } + samples, err := sampleGPUMetrics(gpuIndices) + if err == nil { + elapsed := time.Since(start).Seconds() + for i := range samples { + samples[i].ElapsedSec = elapsed + } + rows = append(rows, samples...) + } + if time.Now().After(deadline) { + break + } + select { + case <-ctx.Done(): + return rows, ctx.Err() + case <-time.After(time.Second): + } + } + return rows, nil +} + +func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) { + stopCh := make(chan struct{}) + doneCh := make(chan struct{}) + var metricRows []GPUMetricRow + start := time.Now() + + go func() { + defer close(doneCh) + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + for { + select { + case <-stopCh: + return + case <-ticker.C: + samples, err := sampleGPUMetrics(gpuIndices) + if err != nil { + continue + } + elapsed := time.Since(start).Seconds() + for i := range samples { + samples[i].ElapsedSec = elapsed + } + metricRows = append(metricRows, samples...) 
+ } + } + }() + + out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) + close(stopCh) + <-doneCh + + writeBenchmarkMetricsFiles(runDir, baseName, metricRows) + return out, metricRows, err +} + +func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) { + if len(rows) == 0 { + return + } + _ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows) + _ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows) + chart := RenderGPUTerminalChart(rows) + _ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644) +} + +func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { + result := benchmarkBurnParseResult{} + lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") + profiles := make(map[string]*benchmarkBurnProfile) + for _, line := range lines { + line = stripBenchmarkPrefix(strings.TrimSpace(line)) + if line == "" { + continue + } + switch { + case strings.HasPrefix(line, "device="): + result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device=")) + case strings.HasPrefix(line, "compute_capability="): + result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability=")) + case strings.HasPrefix(line, "backend="): + result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend=")) + result.Fallback = result.Backend == "driver-ptx" + case strings.HasPrefix(line, "duration_s="): + result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s="))) + default: + if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 { + profile := ensureBenchmarkProfile(profiles, m[1]) + profile.supported = true + profile.lanes++ + profile.m, _ = strconv.ParseUint(m[3], 10, 64) + profile.n, _ = strconv.ParseUint(m[4], 10, 64) + profile.k, _ = strconv.ParseUint(m[5], 10, 64) + continue + } + if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 { + profile := 
ensureBenchmarkProfile(profiles, m[1]) + profile.supported = false + profile.notes = strings.TrimSpace(m[2]) + continue + } + if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 { + profile := ensureBenchmarkProfile(profiles, m[1]) + iters, _ := strconv.ParseUint(m[2], 10, 64) + profile.iterations += iters + } + } + } + + keys := make([]string, 0, len(profiles)) + for key := range profiles { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + profile := profiles[key] + precision := BenchmarkPrecisionResult{ + Name: profile.name, + Category: profile.category, + Supported: profile.supported, + Lanes: profile.lanes, + M: profile.m, + N: profile.n, + K: profile.k, + Iterations: profile.iterations, + Notes: profile.notes, + } + if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 { + precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12 + } + result.Profiles = append(result.Profiles, precision) + } + return result +} + +func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile { + if profile, ok := profiles[name]; ok { + return profile + } + category := "other" + switch { + case strings.HasPrefix(name, "fp32"): + category = "fp32_tf32" + case strings.HasPrefix(name, "fp16"): + category = "fp16_bf16" + case strings.HasPrefix(name, "fp8"): + category = "fp8" + case strings.HasPrefix(name, "fp4"): + category = "fp4" + } + profile := &benchmarkBurnProfile{name: name, category: category, supported: true} + profiles[name] = profile + return profile +} + +func stripBenchmarkPrefix(line string) string { + if strings.HasPrefix(line, "[gpu ") { + if idx := strings.Index(line, "] "); idx >= 0 { + return line[idx+2:] + } + } + return line +} + +func summarizeBenchmarkTelemetry(rows []GPUMetricRow) 
BenchmarkTelemetrySummary { + summary := BenchmarkTelemetrySummary{} + if len(rows) == 0 { + return summary + } + temps := make([]float64, 0, len(rows)) + powers := make([]float64, 0, len(rows)) + clocks := make([]float64, 0, len(rows)) + memClocks := make([]float64, 0, len(rows)) + usages := make([]float64, 0, len(rows)) + memUsages := make([]float64, 0, len(rows)) + summary.DurationSec = rows[len(rows)-1].ElapsedSec + summary.Samples = len(rows) + for _, row := range rows { + temps = append(temps, row.TempC) + powers = append(powers, row.PowerW) + clocks = append(clocks, row.ClockMHz) + memClocks = append(memClocks, row.MemClockMHz) + usages = append(usages, row.UsagePct) + memUsages = append(memUsages, row.MemUsagePct) + } + summary.AvgTempC = benchmarkMean(temps) + summary.P95TempC = benchmarkPercentile(temps, 95) + summary.AvgPowerW = benchmarkMean(powers) + summary.P95PowerW = benchmarkPercentile(powers, 95) + summary.AvgGraphicsClockMHz = benchmarkMean(clocks) + summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95) + summary.AvgMemoryClockMHz = benchmarkMean(memClocks) + summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95) + summary.AvgUsagePct = benchmarkMean(usages) + summary.AvgMemUsagePct = benchmarkMean(memUsages) + summary.ClockCVPct = benchmarkCV(clocks) + summary.PowerCVPct = benchmarkCV(powers) + summary.TempCVPct = benchmarkCV(temps) + summary.ClockDriftPct = benchmarkClockDrift(clocks) + return summary +} + +func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { + score := BenchmarkScorecard{} + for _, precision := range gpu.PrecisionResults { + if precision.Supported { + score.ComputeScore += precision.TeraOpsPerSec + } + } + if gpu.PowerLimitW > 0 { + score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100) + } + runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) + thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS + 
score.ThermalSustainScore = clampScore(100 - thermalRatio*100) + score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2)) + score.CompositeScore = compositeBenchmarkScore(score) + return score +} + +func compositeBenchmarkScore(score BenchmarkScorecard) float64 { + quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0) + if score.InterconnectScore > 0 { + quality += 0.10 + } + if quality > 1.10 { + quality = 1.10 + } + return score.ComputeScore * quality +} + +func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { + var reasons []string + runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) + if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 { + reasons = append(reasons, "power_capped") + } + if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 { + reasons = append(reasons, "thermal_limited") + } + if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 { + reasons = append(reasons, "sync_boost_limited") + } + if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 { + reasons = append(reasons, "low_sm_clock_vs_target") + } + if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 { + reasons = append(reasons, "variance_too_high") + } + if normalizationStatus != "full" { + reasons = append(reasons, "normalization_partial") + } + return dedupeStrings(reasons) +} + +func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult { + result := &BenchmarkInterconnectResult{ + Status: "UNSUPPORTED", + Attempted: true, + SelectedGPUIndices: append([]int(nil), gpuIndices...), + } + cmd := []string{ + "all_reduce_perf", + "-b", "512M", + "-e", "4G", + "-f", "2", + "-g", 
strconv.Itoa(len(gpuIndices)), + "--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)), + } + env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)} + logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices))) + out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc) + _ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644) + if err != nil { + result.Notes = append(result.Notes, strings.TrimSpace(string(out))) + return result + } + avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out)) + result.Status = "OK" + result.Supported = true + result.AvgAlgBWGBps = avgAlg + result.MaxAlgBWGBps = maxAlg + result.AvgBusBWGBps = avgBus + result.MaxBusBWGBps = maxBus + return result +} + +func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) { + lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") + var algs []float64 + var buses []float64 + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + fields := strings.Fields(line) + if len(fields) < 8 { + continue + } + for i := 0; i+2 < len(fields); i++ { + timeVal, err1 := strconv.ParseFloat(fields[i], 64) + algVal, err2 := strconv.ParseFloat(fields[i+1], 64) + busVal, err3 := strconv.ParseFloat(fields[i+2], 64) + if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 { + algs = append(algs, algVal) + buses = append(buses, busVal) + break + } + } + } + if len(algs) == 0 { + return 0, 0, 0, 0 + } + return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses) +} + +func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) { + out, err := satExecCommand( + "nvidia-smi", + "--id="+strconv.Itoa(gpuIndex), + 
"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown", + "--format=csv,noheader,nounits", + ).Output() + if err != nil { + return BenchmarkThrottleCounters{}, err + } + fields := strings.Split(strings.TrimSpace(string(out)), ",") + if len(fields) < 5 { + return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out))) + } + return BenchmarkThrottleCounters{ + SWPowerCapUS: parseBenchmarkUint64(fields[0]), + SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]), + SyncBoostUS: parseBenchmarkUint64(fields[2]), + HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]), + HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]), + }, nil +} + +func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters { + return BenchmarkThrottleCounters{ + SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS), + SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS), + SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS), + HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS), + HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS), + } +} + +func queryActiveComputeApps(gpuIndices []int) ([]string, error) { + args := []string{ + "--query-compute-apps=gpu_uuid,pid,process_name", + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
+ } + out, err := satExecCommand("nvidia-smi", args...).Output() + if err != nil { + return nil, err + } + var lines []string + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + lines = append(lines, line) + } + return lines, nil +} + +func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { + if gpu.Status == "" { + gpu.Status = "OK" + } + if gpu.Scores.CompositeScore == 0 { + gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores) + } + return gpu +} + +func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { + var findings []string + if result.Normalization.Status != "full" { + findings = append(findings, "Environment normalization was partial; compare results with caution.") + } + for _, gpu := range result.GPUs { + if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" { + findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index)) + continue + } + for _, reason := range gpu.DegradationReasons { + switch reason { + case "power_capped": + findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) + case "thermal_limited": + findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)) + case "sync_boost_limited": + findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) + case "low_sm_clock_vs_target": + findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index)) + case "variance_too_high": + findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) + case "normalization_partial": + findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) + } + } + if 
gpu.Backend == "driver-ptx" { + findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) + } + } + if result.Interconnect != nil && result.Interconnect.Supported { + findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps)) + } + return dedupeStrings(findings) +} + +func benchmarkOverallStatus(result NvidiaBenchmarkResult) string { + if len(result.GPUs) == 0 { + return "FAILED" + } + hasOK := false + hasPartial := result.Normalization.Status != "full" + for _, gpu := range result.GPUs { + switch gpu.Status { + case "OK": + hasOK = true + case "PARTIAL", "UNSUPPORTED": + hasPartial = true + } + } + if !hasOK { + return "FAILED" + } + if hasPartial { + return "PARTIAL" + } + return "OK" +} + +func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU { + for i := range items { + if items[i].Index == idx { + return &items[i] + } + } + return nil +} + +func classifySATErrorStatus(out []byte, err error) string { + status, _ := classifySATResult("benchmark", out, err) + if status == "UNSUPPORTED" { + return "UNSUPPORTED" + } + return "FAILED" +} + +func parseBenchmarkFloat(raw string) float64 { + raw = strings.TrimSpace(raw) + if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { + return 0 + } + value, _ := strconv.ParseFloat(raw, 64) + return value +} + +func parseBenchmarkUint64(raw string) uint64 { + raw = strings.TrimSpace(raw) + if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { + return 0 + } + value, _ := strconv.ParseUint(raw, 10, 64) + return value +} + +func benchmarkMean(values []float64) float64 { + if len(values) == 0 { + return 0 + } + var sum float64 + for _, value := range values { + sum += value + } + return sum / float64(len(values)) +} + +func benchmarkPercentile(values []float64, p 
float64) float64 { + if len(values) == 0 { + return 0 + } + copyValues := append([]float64(nil), values...) + sort.Float64s(copyValues) + if len(copyValues) == 1 { + return copyValues[0] + } + rank := (p / 100.0) * float64(len(copyValues)-1) + lower := int(math.Floor(rank)) + upper := int(math.Ceil(rank)) + if lower == upper { + return copyValues[lower] + } + frac := rank - float64(lower) + return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac +} + +func benchmarkCV(values []float64) float64 { + if len(values) == 0 { + return 0 + } + mean := benchmarkMean(values) + if mean == 0 { + return 0 + } + var variance float64 + for _, value := range values { + diff := value - mean + variance += diff * diff + } + variance /= float64(len(values)) + return math.Sqrt(variance) / mean * 100 +} + +func benchmarkClockDrift(values []float64) float64 { + if len(values) < 4 { + return 0 + } + window := len(values) / 4 + if window < 1 { + window = 1 + } + head := benchmarkMean(values[:window]) + tail := benchmarkMean(values[len(values)-window:]) + if head <= 0 || tail >= head { + return 0 + } + return ((head - tail) / head) * 100 +} + +func benchmarkMax(values []float64) float64 { + var max float64 + for i, value := range values { + if i == 0 || value > max { + max = value + } + } + return max +} + +func clampScore(value float64) float64 { + switch { + case value < 0: + return 0 + case value > 100: + return 100 + default: + return value + } +} + +func dedupeStrings(values []string) []string { + if len(values) == 0 { + return nil + } + seen := make(map[string]struct{}, len(values)) + out := make([]string, 0, len(values)) + for _, value := range values { + value = strings.TrimSpace(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + return out +} + +func saturatingSub(after, before uint64) uint64 { + if after <= before { + return 0 + } + return after - before +} + +func 
maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go new file mode 100644 index 0000000..13a3dcf --- /dev/null +++ b/audit/internal/platform/benchmark_report.go @@ -0,0 +1,141 @@ +package platform + +import ( + "fmt" + "strings" + "time" +) + +func renderBenchmarkReport(result NvidiaBenchmarkResult) string { + var b strings.Builder + fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n") + fmt.Fprintf(&b, "===========================\n\n") + fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) + fmt.Fprintf(&b, "Host: %s\n", result.Hostname) + fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus) + fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices)) + fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status) + + if len(result.Findings) > 0 { + fmt.Fprintf(&b, "Executive Summary\n") + fmt.Fprintf(&b, "-----------------\n") + for _, finding := range result.Findings { + fmt.Fprintf(&b, "- %s\n", finding) + } + b.WriteString("\n") + } + + if len(result.Warnings) > 0 { + fmt.Fprintf(&b, "Warnings\n") + fmt.Fprintf(&b, "--------\n") + for _, warning := range result.Warnings { + fmt.Fprintf(&b, "- %s\n", warning) + } + b.WriteString("\n") + } + + fmt.Fprintf(&b, "Per GPU Scorecard\n") + fmt.Fprintf(&b, "-----------------\n") + for _, gpu := range result.GPUs { + fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name) + fmt.Fprintf(&b, " Status: %s\n", gpu.Status) + fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore) + fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore) + fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore) + fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore) + fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore) + if 
gpu.Scores.InterconnectScore > 0 { + fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore) + } + if len(gpu.DegradationReasons) > 0 { + fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", ")) + } + fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz) + fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz) + if len(gpu.PrecisionResults) > 0 { + fmt.Fprintf(&b, " Precision results:\n") + for _, precision := range gpu.PrecisionResults { + if precision.Supported { + fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations) + } else { + fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes) + } + } + } + fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n", + gpu.Throttle.SWPowerCapUS, + gpu.Throttle.SWThermalSlowdownUS, + gpu.Throttle.SyncBoostUS, + gpu.Throttle.HWThermalSlowdownUS, + gpu.Throttle.HWPowerBrakeSlowdownUS, + ) + if len(gpu.Notes) > 0 { + fmt.Fprintf(&b, " Notes:\n") + for _, note := range gpu.Notes { + fmt.Fprintf(&b, " - %s\n", note) + } + } + b.WriteString("\n") + } + + if result.Interconnect != nil { + fmt.Fprintf(&b, "Interconnect\n") + fmt.Fprintf(&b, "------------\n") + fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status) + if result.Interconnect.Supported { + fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps) + fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps) + } + for _, note := range result.Interconnect.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + b.WriteString("\n") + } + + fmt.Fprintf(&b, 
"Methodology\n") + fmt.Fprintf(&b, "-----------\n") + fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n") + fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n") + fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n") + + fmt.Fprintf(&b, "Raw Files\n") + fmt.Fprintf(&b, "---------\n") + fmt.Fprintf(&b, "- result.json\n") + fmt.Fprintf(&b, "- report.txt\n") + fmt.Fprintf(&b, "- summary.txt\n") + fmt.Fprintf(&b, "- verbose.log\n") + fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n") + fmt.Fprintf(&b, "- gpu-*-warmup.log\n") + fmt.Fprintf(&b, "- gpu-*-steady.log\n") + fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n") + fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n") + if result.Interconnect != nil { + fmt.Fprintf(&b, "- nccl-all-reduce.log\n") + } + return b.String() +} + +func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { + var b strings.Builder + fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) + fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) + fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) + fmt.Fprintf(&b, "normalization_status=%s\n", result.Normalization.Status) + var best float64 + for i, gpu := range result.GPUs { + fmt.Fprintf(&b, "gpu_%d_status=%s\n", gpu.Index, gpu.Status) + fmt.Fprintf(&b, "gpu_%d_composite_score=%.2f\n", gpu.Index, gpu.Scores.CompositeScore) + if i == 0 || gpu.Scores.CompositeScore > best { + best = gpu.Scores.CompositeScore + } + } + fmt.Fprintf(&b, "best_composite_score=%.2f\n", best) + if result.Interconnect != nil { + fmt.Fprintf(&b, 
"interconnect_status=%s\n", result.Interconnect.Status) + fmt.Fprintf(&b, "interconnect_max_busbw_gbps=%.1f\n", result.Interconnect.MaxBusBWGBps) + } + return b.String() +} diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go new file mode 100644 index 0000000..51120e7 --- /dev/null +++ b/audit/internal/platform/benchmark_test.go @@ -0,0 +1,132 @@ +package platform + +import ( + "strings" + "testing" +) + +func TestResolveBenchmarkProfile(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + profile string + want benchmarkProfileSpec + }{ + { + name: "default", + profile: "", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}, + }, + { + name: "stability", + profile: "stability", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}, + }, + { + name: "overnight", + profile: "overnight", + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + got := resolveBenchmarkProfile(tc.profile) + if got != tc.want { + t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want) + } + }) + } +} + +func TestParseBenchmarkBurnLog(t *testing.T) { + t.Parallel() + + raw := strings.Join([]string{ + "loader=bee-gpu-burn", + "[gpu 0] device=NVIDIA H100", + "[gpu 0] compute_capability=9.0", + "[gpu 0] backend=cublasLt", + "[gpu 0] duration_s=10", + "[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0", + "[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0", + "[gpu 0] fp16_tensor_iterations=200", + "[gpu 0] fp8_e4m3_iterations=50", + "[gpu 0] status=OK", + }, "\n") + + got := parseBenchmarkBurnLog(raw) + if 
got.Backend != "cublasLt" { + t.Fatalf("backend=%q want cublasLt", got.Backend) + } + if got.ComputeCapability != "9.0" { + t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability) + } + if len(got.Profiles) != 2 { + t.Fatalf("profiles=%d want 2", len(got.Profiles)) + } + if got.Profiles[0].TeraOpsPerSec <= 0 { + t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec) + } + if got.Profiles[1].Category != "fp8" { + t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category) + } +} + +func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { + t.Parallel() + + result := NvidiaBenchmarkResult{ + BenchmarkVersion: benchmarkVersion, + BenchmarkProfile: NvidiaBenchmarkProfileStandard, + OverallStatus: "PARTIAL", + SelectedGPUIndices: []int{0}, + Normalization: BenchmarkNormalization{ + Status: "partial", + }, + Findings: []string{"GPU 0 spent measurable time under SW power cap."}, + GPUs: []BenchmarkGPUResult{ + { + Index: 0, + Name: "NVIDIA H100", + Status: "OK", + Steady: BenchmarkTelemetrySummary{ + AvgPowerW: 680, + AvgTempC: 79, + AvgGraphicsClockMHz: 1725, + P95PowerW: 700, + P95TempC: 82, + P95GraphicsClockMHz: 1800, + }, + Scores: BenchmarkScorecard{ + ComputeScore: 1200, + PowerSustainScore: 96, + ThermalSustainScore: 88, + StabilityScore: 92, + CompositeScore: 1176, + }, + PrecisionResults: []BenchmarkPrecisionResult{ + {Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700}, + }, + Throttle: BenchmarkThrottleCounters{ + SWPowerCapUS: 1000000, + }, + DegradationReasons: []string{"power_capped"}, + }, + }, + } + + report := renderBenchmarkReport(result) + for _, needle := range []string{ + "Executive Summary", + "GPU 0 spent measurable time under SW power cap.", + "Composite score: 1176.00", + "fp16_tensor: 700.00 TOPS", + } { + if !strings.Contains(report, needle) { + t.Fatalf("report missing %q\n%s", needle, report) + } + } +} diff --git a/audit/internal/platform/benchmark_types.go 
// Supported benchmark profile names accepted by --profile.
const (
	NvidiaBenchmarkProfileStandard  = "standard"
	NvidiaBenchmarkProfileStability = "stability"
	NvidiaBenchmarkProfileOvernight = "overnight"
)

// NvidiaBenchmarkOptions carries caller-selected knobs for a benchmark run.
type NvidiaBenchmarkOptions struct {
	Profile           string // one of the NvidiaBenchmarkProfile* constants
	SizeMB            int    // per-GPU buffer size in MB; 0 means auto
	GPUIndices        []int  // GPUs to include; empty means all
	ExcludeGPUIndices []int  // GPUs to exclude
	RunNCCL           bool   // whether to run the multi-GPU NCCL phase
}

// NvidiaBenchmarkResult is the canonical machine-readable output of one run
// (serialized to result.json).
type NvidiaBenchmarkResult struct {
	BenchmarkVersion   string                       `json:"benchmark_version"`
	GeneratedAt        time.Time                    `json:"generated_at"`
	Hostname           string                       `json:"hostname,omitempty"`
	BenchmarkProfile   string                       `json:"benchmark_profile"`
	OverallStatus      string                       `json:"overall_status"`
	SelectedGPUIndices []int                        `json:"selected_gpu_indices"`
	Findings           []string                     `json:"findings,omitempty"`
	Warnings           []string                     `json:"warnings,omitempty"`
	Normalization      BenchmarkNormalization       `json:"normalization"`
	GPUs               []BenchmarkGPUResult         `json:"gpus"`
	Interconnect       *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
}

// BenchmarkNormalization records how completely the environment was pinned
// (clock locks, persistence mode) before measurement.
type BenchmarkNormalization struct {
	Status string                      `json:"status"`
	Notes  []string                    `json:"notes,omitempty"`
	GPUs   []BenchmarkNormalizationGPU `json:"gpus,omitempty"`
}

// BenchmarkNormalizationGPU is the per-GPU normalization detail.
type BenchmarkNormalizationGPU struct {
	Index                 int      `json:"index"`
	PersistenceMode       string   `json:"persistence_mode,omitempty"`
	GPUClockLockMHz       float64  `json:"gpu_clock_lock_mhz,omitempty"`
	GPUClockLockStatus    string   `json:"gpu_clock_lock_status,omitempty"`
	MemoryClockLockMHz    float64  `json:"memory_clock_lock_mhz,omitempty"`
	MemoryClockLockStatus string   `json:"memory_clock_lock_status,omitempty"`
	Notes                 []string `json:"notes,omitempty"`
}

// BenchmarkGPUResult aggregates identity, phase telemetry, throttle counters,
// precision throughput, and scores for a single GPU.
type BenchmarkGPUResult struct {
	Index                  int                        `json:"index"`
	UUID                   string                     `json:"uuid,omitempty"`
	Name                   string                     `json:"name,omitempty"`
	BusID                  string                     `json:"bus_id,omitempty"`
	VBIOS                  string                     `json:"vbios,omitempty"`
	ComputeCapability      string                     `json:"compute_capability,omitempty"`
	Backend                string                     `json:"backend,omitempty"`
	Status                 string                     `json:"status"`
	PowerLimitW            float64                    `json:"power_limit_w,omitempty"`
	MaxGraphicsClockMHz    float64                    `json:"max_graphics_clock_mhz,omitempty"`
	MaxMemoryClockMHz      float64                    `json:"max_memory_clock_mhz,omitempty"`
	LockedGraphicsClockMHz float64                    `json:"locked_graphics_clock_mhz,omitempty"`
	LockedMemoryClockMHz   float64                    `json:"locked_memory_clock_mhz,omitempty"`
	Baseline               BenchmarkTelemetrySummary  `json:"baseline"`
	Steady                 BenchmarkTelemetrySummary  `json:"steady"`
	Cooldown               BenchmarkTelemetrySummary  `json:"cooldown"`
	Throttle               BenchmarkThrottleCounters  `json:"throttle_counters"`
	PrecisionResults       []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
	Scores                 BenchmarkScorecard         `json:"scores"`
	DegradationReasons     []string                   `json:"degradation_reasons,omitempty"`
	Notes                  []string                   `json:"notes,omitempty"`
}

// BenchmarkTelemetrySummary is the statistical digest of one telemetry phase.
type BenchmarkTelemetrySummary struct {
	DurationSec         float64 `json:"duration_sec"`
	Samples             int     `json:"samples"`
	AvgTempC            float64 `json:"avg_temp_c"`
	P95TempC            float64 `json:"p95_temp_c"`
	AvgPowerW           float64 `json:"avg_power_w"`
	P95PowerW           float64 `json:"p95_power_w"`
	AvgGraphicsClockMHz float64 `json:"avg_graphics_clock_mhz"`
	P95GraphicsClockMHz float64 `json:"p95_graphics_clock_mhz"`
	AvgMemoryClockMHz   float64 `json:"avg_memory_clock_mhz"`
	P95MemoryClockMHz   float64 `json:"p95_memory_clock_mhz"`
	AvgUsagePct         float64 `json:"avg_usage_pct"`
	AvgMemUsagePct      float64 `json:"avg_mem_usage_pct"`
	ClockCVPct          float64 `json:"clock_cv_pct"`
	PowerCVPct          float64 `json:"power_cv_pct"`
	TempCVPct           float64 `json:"temp_cv_pct"`
	ClockDriftPct       float64 `json:"clock_drift_pct"`
}

// BenchmarkThrottleCounters mirrors nvidia-smi clock-event-reason counters,
// each the cumulative microseconds spent under that limiter.
type BenchmarkThrottleCounters struct {
	SWPowerCapUS           uint64 `json:"sw_power_cap_us"`
	SWThermalSlowdownUS    uint64 `json:"sw_thermal_slowdown_us"`
	SyncBoostUS            uint64 `json:"sync_boost_us"`
	HWThermalSlowdownUS    uint64 `json:"hw_thermal_slowdown_us"`
	HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}

// BenchmarkPrecisionResult is one precision lane's GEMM throughput result.
type BenchmarkPrecisionResult struct {
	Name          string  `json:"name"`
	Category      string  `json:"category"`
	Supported     bool    `json:"supported"`
	Lanes         int     `json:"lanes,omitempty"`
	M             uint64  `json:"m,omitempty"`
	N             uint64  `json:"n,omitempty"`
	K             uint64  `json:"k,omitempty"`
	Iterations    uint64  `json:"iterations,omitempty"`
	TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
	Notes         string  `json:"notes,omitempty"`
}

// BenchmarkScorecard holds the per-GPU sub-scores and the composite.
type BenchmarkScorecard struct {
	ComputeScore        float64 `json:"compute_score"`
	PowerSustainScore   float64 `json:"power_sustain_score"`
	ThermalSustainScore float64 `json:"thermal_sustain_score"`
	StabilityScore      float64 `json:"stability_score"`
	InterconnectScore   float64 `json:"interconnect_score"`
	CompositeScore      float64 `json:"composite_score"`
}

// BenchmarkInterconnectResult captures the multi-GPU NCCL all_reduce phase.
type BenchmarkInterconnectResult struct {
	Status             string   `json:"status"`
	Attempted          bool     `json:"attempted"`
	Supported          bool     `json:"supported"`
	SelectedGPUIndices []int    `json:"selected_gpu_indices,omitempty"`
	AvgAlgBWGBps       float64  `json:"avg_algbw_gbps,omitempty"`
	MaxAlgBWGBps       float64  `json:"max_algbw_gbps,omitempty"`
	AvgBusBWGBps       float64  `json:"avg_busbw_gbps,omitempty"`
	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
	Notes              []string `json:"notes,omitempty"`
}
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer - b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,power_w,clock_mhz,mem_clock_mhz\n") + b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") for _, r := range rows { - fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.0f,%.0f\n", - r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) + fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", + r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) } return os.WriteFile(path, b.Bytes(), 0644) } diff --git a/bible b/bible index 688b87e..1d89a49 160000 --- a/bible +++ b/bible @@ -1 +1 @@ -Subproject commit 688b87e98deed5fadd71e10e123073640d92c15a +Subproject commit 1d89a4918e6d4b42847e6dbeccbcc40b091a8369