diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index b17eda6..a66979f 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -124,6 +124,7 @@ type satRunner interface { RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) + RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) @@ -574,6 +575,13 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) } +func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBeeBenchPowerDir + } + return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc) +} + func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) { if strings.TrimSpace(baseDir) == "" { baseDir = DefaultSATBaseDir diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index 49b4889..57c3385 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus { type fakeSAT struct { runNvidiaFn func(string) (string, error) runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error) + runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error) runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error) runNvidiaComputeFn func(string, int, []int) (string, error) runNvidiaPowerFn func(string, int, []int) (string, error) @@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat return f.runNvidiaFn(baseDir) } +func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) { + if f.runNvidiaPowerBenchFn != nil { + return f.runNvidiaPowerBenchFn(baseDir, opts) + } + return f.runNvidiaFn(baseDir) +} + func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) { if f.runNvidiaTargetedStressFn != nil { return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 4ddac4c..719b14f 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration( } return results, restore } + +func powerBenchDurationSec(profile string) int { + switch strings.TrimSpace(strings.ToLower(profile)) { + case NvidiaBenchmarkProfileStability: + return 300 + case NvidiaBenchmarkProfileOvernight: + return 600 + default: + return 120 + } +} + +func occupiedSlots(indices []int, current int) []int { + out := make([]int, 0, len(indices)) + for _, idx := range indices { + if idx != current { + out = append(out, idx) + } + } + return out +} + +func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo { + out := make(map[int]benchmarkGPUInfo, len(src)) + for k, v := range src { + out[k] = v + } + return out +} + +func renderPowerBenchReport(result NvidiaPowerBenchResult) string { + var b strings.Builder + b.WriteString("# Bee Bench Power Report\n\n") + fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion) + fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) + fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) + fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus) + if len(result.Findings) > 0 { + b.WriteString("## Summary\n\n") + for _, finding := range result.Findings { + fmt.Fprintf(&b, "- %s\n", finding) + } + b.WriteString("\n") + } + if len(result.RecommendedSlotOrder) > 0 { + b.WriteString("## Recommended Slot Order\n\n") + fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder)) + } + if len(result.RampSteps) > 0 { + b.WriteString("## Ramp Sequence\n\n") + b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n") + b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n") + for _, step := range result.RampSteps { + fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n", + step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount) + } + b.WriteString("\n") + } + b.WriteString("## Per-Slot Results\n\n") + b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n") + b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n") + for _, gpu := range result.GPUs { + fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n", + gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts) + } + b.WriteString("\n") + for _, gpu := range result.GPUs { + fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) + if gpu.OccupiedSlotsNote != "" { + fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote) + } + for _, note := range gpu.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + b.WriteString("\n") + } + return b.String() +} + +func renderPowerBenchSummary(result NvidiaPowerBenchResult) string { + var b strings.Builder + fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) + fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion) + fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) + fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) + fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) + if len(result.RecommendedSlotOrder) > 0 { + fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder)) + } + for _, step := range result.RampSteps { + fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices)) + fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW) + } + return b.String() +} + +func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if ctx == nil { + ctx = context.Background() + } + if logFunc == nil { + logFunc = func(string) {} + } + if strings.TrimSpace(baseDir) == "" { + baseDir = "/var/log/bee-bench/power" + } + opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) + selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) + if err != nil { + return "", err + } + if len(selected) == 0 { + return "", fmt.Errorf("no NVIDIA GPUs selected") + } + ts := time.Now().UTC().Format("20060102-150405") + runDir := filepath.Join(baseDir, "power-"+ts) + if err := os.MkdirAll(runDir, 0755); err != nil { + return "", fmt.Errorf("mkdir %s: %w", runDir, err) + } + verboseLog := filepath.Join(runDir, "verbose.log") + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) + if infoErr != nil { + return "", infoErr + } + hostname, _ := os.Hostname() + result := NvidiaPowerBenchResult{ + BenchmarkVersion: benchmarkVersion, + GeneratedAt: time.Now().UTC(), + Hostname: hostname, + ServerModel: readServerModel(), + BenchmarkProfile: opts.Profile, + SelectedGPUIndices: append([]int(nil), selected...), + OverallStatus: "OK", + } + durationSec := powerBenchDurationSec(opts.Profile) + _ = durationSec + calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc) + defer func() { + for i := len(restoreActions) - 1; i >= 0; i-- { + restoreActions[i].fn() + } + }() + gpus := make([]NvidiaPowerBenchGPU, 0, len(selected)) + for _, idx := range selected { + info := infoByIndex[idx] + calib := calibByIndex[idx] + status := "OK" + if !calib.Completed { + status = "FAILED" + result.OverallStatus = "PARTIAL" + } else if calib.Derated { + status = "PARTIAL" + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" + } + } + occupied := occupiedSlots(selected, idx) + note := "" + if len(occupied) > 0 { + note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied)) + } + gpus = append(gpus, NvidiaPowerBenchGPU{ + Index: idx, + Name: info.Name, + BusID: info.BusID, + DefaultPowerLimitW: info.DefaultPowerLimitW, + AppliedPowerLimitW: calib.AppliedPowerLimitW, + MaxObservedPowerW: calib.Summary.P95PowerW, + MaxObservedTempC: calib.Summary.P95TempC, + CalibrationAttempts: calib.Attempts, + Derated: calib.Derated, + Status: status, + OccupiedSlots: occupied, + OccupiedSlotsNote: note, + Notes: append([]string(nil), calib.Notes...), + }) + } + sort.Slice(gpus, func(i, j int) bool { + if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW { + return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW + } + if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW { + return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW + } + if gpus[i].Derated != gpus[j].Derated { + return !gpus[i].Derated + } + return gpus[i].Index < gpus[j].Index + }) + result.GPUs = gpus + result.RecommendedSlotOrder = make([]int, 0, len(gpus)) + for _, gpu := range gpus { + result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index) + } + if len(result.RecommendedSlotOrder) > 0 { + result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder))) + } + for _, gpu := range gpus { + if gpu.Derated { + result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) + } + } + singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus)) + for _, gpu := range gpus { + singleByIndex[gpu.Index] = gpu + } + for step := 1; step <= len(result.RecommendedSlotOrder); step++ { + subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) + stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) + _ = os.MkdirAll(stepDir, 0755) + stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) + stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc) + for i := len(stepRestore) - 1; i >= 0; i-- { + stepRestore[i].fn() + } + ramp := NvidiaPowerBenchStep{ + StepIndex: step, + GPUIndices: subset, + Status: "OK", + } + var realizationValues []float64 + for _, idx := range subset { + calib := stepCalib[idx] + ramp.TotalObservedPowerW += calib.Summary.P95PowerW + if calib.Derated { + ramp.DeratedGPUCount++ + ramp.Status = "PARTIAL" + } + if !calib.Completed { + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step)) + continue + } + if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 { + realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100 + realizationValues = append(realizationValues, realization) + } + } + if len(subset) > 0 { + ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) + } + if len(realizationValues) > 0 { + ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues) + ramp.MinPowerRealizationPct = realizationValues[0] + for _, v := range realizationValues[1:] { + if v < ramp.MinPowerRealizationPct { + ramp.MinPowerRealizationPct = v + } + } + } + if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 { + ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step)) + if result.OverallStatus == "OK" { + result.OverallStatus = "PARTIAL" + } + } + if ramp.DeratedGPUCount > 0 { + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount)) + } + result.RampSteps = append(result.RampSteps, ramp) + } + resultJSON, err := json.MarshalIndent(result, "", " ") + if err != nil { + return "", fmt.Errorf("marshal power result: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { + return "", fmt.Errorf("write result.json: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil { + return "", fmt.Errorf("write report.md: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil { + return "", fmt.Errorf("write summary.txt: %w", err) + } + return runDir, nil +} diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 080a257..6c497b0 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct { MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"` Notes []string `json:"notes,omitempty"` } + +type NvidiaPowerBenchResult struct { + BenchmarkVersion string `json:"benchmark_version"` + GeneratedAt time.Time `json:"generated_at"` + Hostname string `json:"hostname,omitempty"` + ServerModel string `json:"server_model,omitempty"` + BenchmarkProfile string `json:"benchmark_profile"` + SelectedGPUIndices []int `json:"selected_gpu_indices"` + RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"` + RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"` + OverallStatus string `json:"overall_status"` + Findings []string `json:"findings,omitempty"` + GPUs []NvidiaPowerBenchGPU `json:"gpus"` +} + +type NvidiaPowerBenchGPU struct { + Index int `json:"index"` + Name string `json:"name,omitempty"` + BusID string `json:"bus_id,omitempty"` + DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` + MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"` + MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"` + CalibrationAttempts int `json:"calibration_attempts,omitempty"` + Derated bool `json:"derated,omitempty"` + Status string `json:"status"` + OccupiedSlots []int `json:"occupied_slots,omitempty"` + OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"` + Notes []string `json:"notes,omitempty"` +} + +type NvidiaPowerBenchStep struct { + StepIndex int `json:"step_index"` + GPUIndices []int `json:"gpu_indices"` + TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` + AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` + MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"` + AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"` + DeratedGPUCount int `json:"derated_gpu_count,omitempty"` + Status string `json:"status"` + Notes []string `json:"notes,omitempty"` +} diff --git a/audit/internal/webui/task_report.go b/audit/internal/webui/task_report.go index 8ec87e6..535c32b 100644 --- a/audit/internal/webui/task_report.go +++ b/audit/internal/webui/task_report.go @@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" { b.WriteString(benchmarkCard) } + if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" { + b.WriteString(powerCard) + } if len(report.Charts) > 0 { for _, chart := range report.Charts { @@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string { ) } +func renderTaskPowerResultsCard(target, logText string) string { + if strings.TrimSpace(target) != "nvidia-bench-power" { + return "" + } + resultPath := taskBenchmarkResultPath(logText) + if strings.TrimSpace(resultPath) == "" { + return "" + } + raw, err := os.ReadFile(resultPath) + if err != nil { + return "" + } + var result platform.NvidiaPowerBenchResult + if err := json.Unmarshal(raw, &result); err != nil { + return "" + } + var b strings.Builder + b.WriteString(`
Power Results
`) + if len(result.RecommendedSlotOrder) > 0 { + b.WriteString(`

Recommended slot order: ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `

`) + } + b.WriteString(``) + for _, gpu := range result.GPUs { + fmt.Fprintf(&b, ``, + gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW) + } + b.WriteString(`
GPUStatusMax PowerApplied Limit
GPU %d%s%.0f W%.0f W
`) + return b.String() +} + func taskBenchmarkResultPath(logText string) string { archivePath := taskArchivePathFromLog(logText) if archivePath == "" { return "" } runDir := strings.TrimSuffix(archivePath, ".tar.gz") - if runDir == archivePath { - return "" - } return filepath.Join(runDir, "result.json") } diff --git a/audit/internal/webui/tasks.go b/audit/internal/webui/tasks.go index b4665f5..e8f9928 100644 --- a/audit/internal/webui/tasks.go +++ b/audit/internal/webui/tasks.go @@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) { err = fmt.Errorf("app not configured") break } - dur := t.params.Duration - if dur <= 0 { - switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) { - case platform.NvidiaBenchmarkProfileStability: - dur = 300 - case platform.NvidiaBenchmarkProfileOvernight: - dur = 600 - default: - dur = 120 - } - } - rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices) - if planErr != nil { - err = planErr - break - } - if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 { - dur = rampPlan.DurationSec - } - archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append) + archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{ + Profile: t.params.BenchmarkProfile, + GPUIndices: t.params.GPUIndices, + ExcludeGPUIndices: t.params.ExcludeGPUIndices, + RampStep: t.params.RampStep, + RampTotal: t.params.RampTotal, + RampRunID: t.params.RampRunID, + }, j.append) case "nvidia-compute": if a == nil { err = fmt.Errorf("app not configured")