Add slot-aware ramp sequence to bee-bench power

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:47:40 +03:00
parent 95124d228f
commit 303de2df04
6 changed files with 375 additions and 23 deletions

View File

@@ -124,6 +124,7 @@ type satRunner interface {
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -574,6 +575,13 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir
}
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir

View File

@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
type fakeSAT struct {
runNvidiaFn func(string) (string, error)
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
runNvidiaComputeFn func(string, int, []int) (string, error)
runNvidiaPowerFn func(string, int, []int) (string, error)
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
if f.runNvidiaPowerBenchFn != nil {
return f.runNvidiaPowerBenchFn(baseDir, opts)
}
return f.runNvidiaFn(baseDir)
}
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
if f.runNvidiaTargetedStressFn != nil {
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)

View File

@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
}
return results, restore
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
return 300
case NvidiaBenchmarkProfileOvernight:
return 600
default:
return 120
}
}
func occupiedSlots(indices []int, current int) []int {
out := make([]int, 0, len(indices))
for _, idx := range indices {
if idx != current {
out = append(out, idx)
}
}
return out
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
for k, v := range src {
out[k] = v
}
return out
}
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
var b strings.Builder
b.WriteString("# Bee Bench Power Report\n\n")
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
if len(result.Findings) > 0 {
b.WriteString("## Summary\n\n")
for _, finding := range result.Findings {
fmt.Fprintf(&b, "- %s\n", finding)
}
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString("## Recommended Slot Order\n\n")
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
}
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
for _, step := range result.RampSteps {
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
}
b.WriteString("\n")
}
b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
}
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
}
return b.String()
}
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
if len(result.RecommendedSlotOrder) > 0 {
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
}
for _, step := range result.RampSteps {
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
}
return b.String()
}
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if logFunc == nil {
logFunc = func(string) {}
}
if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-bench/power"
}
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil {
return "", err
}
if len(selected) == 0 {
return "", fmt.Errorf("no NVIDIA GPUs selected")
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "power-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
GeneratedAt: time.Now().UTC(),
Hostname: hostname,
ServerModel: readServerModel(),
BenchmarkProfile: opts.Profile,
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
for _, idx := range selected {
info := infoByIndex[idx]
calib := calibByIndex[idx]
status := "OK"
if !calib.Completed {
status = "FAILED"
result.OverallStatus = "PARTIAL"
} else if calib.Derated {
status = "PARTIAL"
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,
BusID: info.BusID,
DefaultPowerLimitW: info.DefaultPowerLimitW,
AppliedPowerLimitW: calib.AppliedPowerLimitW,
MaxObservedPowerW: calib.Summary.P95PowerW,
MaxObservedTempC: calib.Summary.P95TempC,
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
})
}
sort.Slice(gpus, func(i, j int) bool {
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
}
if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
}
if gpus[i].Derated != gpus[j].Derated {
return !gpus[i].Derated
}
return gpus[i].Index < gpus[j].Index
})
result.GPUs = gpus
result.RecommendedSlotOrder = make([]int, 0, len(gpus))
for _, gpu := range gpus {
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
}
if len(result.RecommendedSlotOrder) > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
}
for _, gpu := range gpus {
if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
}
}
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,
GPUIndices: subset,
Status: "OK",
}
var realizationValues []float64
for _, idx := range subset {
calib := stepCalib[idx]
ramp.TotalObservedPowerW += calib.Summary.P95PowerW
if calib.Derated {
ramp.DeratedGPUCount++
ramp.Status = "PARTIAL"
}
if !calib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
continue
}
if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
realizationValues = append(realizationValues, realization)
}
}
if len(subset) > 0 {
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
}
if len(realizationValues) > 0 {
ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
ramp.MinPowerRealizationPct = realizationValues[0]
for _, v := range realizationValues[1:] {
if v < ramp.MinPowerRealizationPct {
ramp.MinPowerRealizationPct = v
}
}
}
if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
}
if ramp.DeratedGPUCount > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
}
result.RampSteps = append(result.RampSteps, ramp)
}
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal power result: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
return "", fmt.Errorf("write result.json: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
return "", fmt.Errorf("write summary.txt: %w", err)
}
return runDir, nil
}

View File

@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type NvidiaPowerBenchResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
SelectedGPUIndices []int `json:"selected_gpu_indices"`
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
OverallStatus string `json:"overall_status"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}
type NvidiaPowerBenchGPU struct {
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
Status string `json:"status"`
OccupiedSlots []int `json:"occupied_slots,omitempty"`
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type NvidiaPowerBenchStep struct {
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
}

View File

@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
b.WriteString(benchmarkCard)
}
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
b.WriteString(powerCard)
}
if len(report.Charts) > 0 {
for _, chart := range report.Charts {
@@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
)
}
func renderTaskPowerResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-bench-power" {
return ""
}
resultPath := taskBenchmarkResultPath(logText)
if strings.TrimSpace(resultPath) == "" {
return ""
}
raw, err := os.ReadFile(resultPath)
if err != nil {
return ""
}
var result platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &result); err != nil {
return ""
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
}
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
}
b.WriteString(`</table></div></div>`)
return b.String()
}
func taskBenchmarkResultPath(logText string) string {
archivePath := taskArchivePathFromLog(logText)
if archivePath == "" {
return ""
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
if runDir == archivePath {
return ""
}
return filepath.Join(runDir, "result.json")
}

View File

@@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
case platform.NvidiaBenchmarkProfileStability:
dur = 300
case platform.NvidiaBenchmarkProfileOvernight:
dur = 600
default:
dur = 120
}
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
dur = rampPlan.DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")