Add slot-aware ramp sequence to bee-bench power

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:47:40 +03:00
parent 95124d228f
commit 303de2df04
6 changed files with 375 additions and 23 deletions

View File

@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
}
return results, restore
}
// powerBenchDurationSec maps a benchmark profile name to a duration (seconds,
// per the function name). The profile is compared case-insensitively and with
// surrounding whitespace ignored. The stability profile yields 300, the
// overnight profile 600, and anything else falls back to 120.
func powerBenchDurationSec(profile string) int {
	normalized := strings.TrimSpace(strings.ToLower(profile))
	if normalized == NvidiaBenchmarkProfileStability {
		return 300
	}
	if normalized == NvidiaBenchmarkProfileOvernight {
		return 600
	}
	return 120
}
// occupiedSlots returns every entry of indices that is not equal to current,
// preserving the input order. The result is always a non-nil slice, even when
// nothing remains.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for i := 0; i < len(indices); i++ {
		if indices[i] == current {
			continue
		}
		others = append(others, indices[i])
	}
	return others
}
// cloneBenchmarkGPUInfoMap returns a new map holding the same key/value pairs
// as src. Values are copied by assignment, so the copy is shallow. The result
// is never nil, even when src is nil or empty.
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
	dup := make(map[int]benchmarkGPUInfo, len(src))
	for idx := range src {
		dup[idx] = src[idx]
	}
	return dup
}
// renderPowerBenchReport renders result as a Markdown document.
//
// The report contains, in order: a metadata header, the findings list (when
// any), the recommended slot order (when any), the ramp sequence table (when
// any), a per-GPU results table, and one section per GPU listing its
// occupied-slots note and calibration notes.
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Report\n\n")

	// Markdown needs two trailing spaces before the newline for a hard line
	// break; with fewer, the metadata lines are merged into one paragraph.
	fmt.Fprintf(&b, "**Benchmark version:** %s  \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
	// The layout appends a literal "UTC", so normalize the instant first to
	// keep the label truthful for timestamps carrying another zone.
	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
	fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)

	if len(result.Findings) > 0 {
		b.WriteString("## Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}

	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString("## Recommended Slot Order\n\n")
		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
	}

	if len(result.RampSteps) > 0 {
		b.WriteString("## Ramp Sequence\n\n")
		b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
		b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
		for _, step := range result.RampSteps {
			fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
		}
		b.WriteString("\n")
	}

	b.WriteString("## Per-Slot Results\n\n")
	b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
	b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
			gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
	}
	b.WriteString("\n")

	// One detail section per GPU, in the same order as the table above.
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
		if gpu.OccupiedSlotsNote != "" {
			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
		}
		for _, note := range gpu.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}
	return b.String()
}
// renderPowerBenchSummary renders result as newline-separated key=value lines.
//
// It always emits run_at_utc, benchmark_version, benchmark_profile,
// overall_status and gpu_count. recommended_slot_order is emitted only when
// the order is non-empty, followed by two lines per ramp step
// (ramp_step_<n>_gpus and ramp_step_<n>_total_power_w).
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	// The key promises UTC, so normalize before formatting; RFC3339 would
	// otherwise emit whatever zone offset the timestamp carries.
	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
	if len(result.RecommendedSlotOrder) > 0 {
		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
	}
	for _, step := range result.RampSteps {
		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
	}
	return b.String()
}
// RunNvidiaPowerBench runs the bee-bench power benchmark on the selected
// NVIDIA GPUs and writes its artifacts into a new "power-<timestamp>"
// directory under baseDir.
//
// The run has two phases:
//
//  1. A calibration pass over all selected GPUs, whose per-GPU results are
//     sorted (highest observed power first) into RecommendedSlotOrder.
//  2. A ramp sequence: step N re-runs calibration on the first N GPUs of the
//     recommended order and compares each GPU's observed power against its
//     phase-1 value ("power realization").
//
// Parameters:
//   - ctx: may be nil, in which case context.Background() is used. If ctx is
//     done before a ramp step starts, the remaining steps are skipped and the
//     results gathered so far are still written.
//   - baseDir: output root; blank means "/var/log/bee-bench/power".
//   - opts: benchmark options; normalized before use.
//   - logFunc: progress sink passed to the calibration runs; may be nil.
//
// It returns the run directory on success. An error is returned when GPU
// selection or GPU info lookup fails, no GPU is selected, the run directory
// cannot be created, or result.json / report.md / summary.txt cannot be
// written. Measurement failures do not produce an error; they are reflected in
// the result's statuses and findings.
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/power"
	}
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}

	// One clock reading feeds both the directory name and GeneratedAt so the
	// two always agree.
	startedAt := time.Now().UTC()
	runDir := filepath.Join(baseDir, "power-"+startedAt.Format("20060102-150405"))
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		return "", infoErr
	}
	// Hostname is informational only; an empty value is omitted from the JSON.
	hostname, _ := os.Hostname()
	result := NvidiaPowerBenchResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        startedAt,
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   opts.Profile,
		SelectedGPUIndices: append([]int(nil), selected...),
		OverallStatus:      "OK",
	}
	// markPartial downgrades the overall status; it only ever holds "OK" or
	// "PARTIAL" in this function.
	markPartial := func() {
		if result.OverallStatus == "OK" {
			result.OverallStatus = "PARTIAL"
		}
	}

	// NOTE(review): the profile-derived duration is computed but not consumed
	// by anything in this function yet.
	durationSec := powerBenchDurationSec(opts.Profile)
	_ = durationSec

	// Phase 1: calibration across all selected GPUs. Its restore actions are
	// deferred to function exit (run in reverse order), i.e. after the ramp
	// sequence below has finished.
	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()

	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
	for _, idx := range selected {
		info := infoByIndex[idx]
		calib := calibByIndex[idx]
		status := "OK"
		if !calib.Completed {
			status = "FAILED"
			markPartial()
		} else if calib.Derated {
			status = "PARTIAL"
			markPartial()
		}
		occupied := occupiedSlots(selected, idx)
		note := ""
		if len(occupied) > 0 {
			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
		}
		gpus = append(gpus, NvidiaPowerBenchGPU{
			Index:               idx,
			Name:                info.Name,
			BusID:               info.BusID,
			DefaultPowerLimitW:  info.DefaultPowerLimitW,
			AppliedPowerLimitW:  calib.AppliedPowerLimitW,
			MaxObservedPowerW:   calib.Summary.P95PowerW,
			MaxObservedTempC:    calib.Summary.P95TempC,
			CalibrationAttempts: calib.Attempts,
			Derated:             calib.Derated,
			Status:              status,
			OccupiedSlots:       occupied,
			OccupiedSlotsNote:   note,
			Notes:               append([]string(nil), calib.Notes...),
		})
	}

	// Rank GPUs: higher observed power first, then higher applied limit, then
	// non-derated before derated, then lower index for a stable result.
	sort.Slice(gpus, func(i, j int) bool {
		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
			return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
		}
		if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
			return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
		}
		if gpus[i].Derated != gpus[j].Derated {
			return !gpus[i].Derated
		}
		return gpus[i].Index < gpus[j].Index
	})
	result.GPUs = gpus
	result.RecommendedSlotOrder = make([]int, 0, len(gpus))
	for _, gpu := range gpus {
		result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
	}
	if len(result.RecommendedSlotOrder) > 0 {
		result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
	}
	for _, gpu := range gpus {
		if gpu.Derated {
			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
		}
	}

	// Phase-1 results keyed by GPU index; they are the baseline for the
	// realization percentages computed in the ramp sequence.
	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
	for _, gpu := range gpus {
		singleByIndex[gpu.Index] = gpu
	}

	// Phase 2: ramp sequence over growing prefixes of the recommended order.
	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
		// Do not start further calibration runs once the caller has given up.
		if ctxErr := ctx.Err(); ctxErr != nil {
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp sequence stopped before step %d: %v.", step, ctxErr))
			markPartial()
			break
		}

		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
		ramp := NvidiaPowerBenchStep{
			StepIndex:  step,
			GPUIndices: subset,
			Status:     "OK",
		}

		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
		if mkErr := os.MkdirAll(stepDir, 0755); mkErr != nil {
			// Without its output directory the step cannot be recorded;
			// report it as failed instead of silently running into it.
			ramp.Status = "FAILED"
			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Ramp step %d skipped: mkdir %s: %v", step, stepDir, mkErr))
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) was skipped: mkdir %s: %v.", step, joinIndexList(subset), stepDir, mkErr))
			markPartial()
			result.RampSteps = append(result.RampSteps, ramp)
			continue
		}

		// Each step gets its own copy of the GPU info map so the shared one
		// is not handed to the calibration run again.
		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
		// Undo this step's changes immediately, in reverse order, before the
		// next step starts.
		for i := len(stepRestore) - 1; i >= 0; i-- {
			stepRestore[i].fn()
		}

		var realizationValues []float64
		for _, idx := range subset {
			calib := stepCalib[idx]
			ramp.TotalObservedPowerW += calib.Summary.P95PowerW
			if calib.Derated {
				ramp.DeratedGPUCount++
				// Never downgrade an earlier FAILED to PARTIAL.
				if ramp.Status == "OK" {
					ramp.Status = "PARTIAL"
				}
			}
			if !calib.Completed {
				ramp.Status = "FAILED"
				ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
				continue
			}
			if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
				realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
				realizationValues = append(realizationValues, realization)
			}
		}
		if len(subset) > 0 {
			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
		}
		if len(realizationValues) > 0 {
			ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
			ramp.MinPowerRealizationPct = realizationValues[0]
			for _, v := range realizationValues[1:] {
				if v < ramp.MinPowerRealizationPct {
					ramp.MinPowerRealizationPct = v
				}
			}
		}
		if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
			markPartial()
		}
		if ramp.DeratedGPUCount > 0 {
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
		}
		if ramp.Status == "FAILED" {
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) did not complete targeted_power on all GPUs.", step, joinIndexList(subset)))
		}
		// A degraded ramp step degrades the run, matching how failures and
		// derating are treated in the single-card phase.
		if ramp.Status != "OK" {
			markPartial()
		}
		result.RampSteps = append(result.RampSteps, ramp)
	}

	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal power result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}

View File

@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
Notes []string `json:"notes,omitempty"`
}
// NvidiaPowerBenchResult is the top-level record of one bee-bench power run.
// RunNvidiaPowerBench marshals it to result.json and feeds it to the report
// and summary renderers.
type NvidiaPowerBenchResult struct {
BenchmarkVersion string `json:"benchmark_version"`
// GeneratedAt is set from the UTC wall clock by RunNvidiaPowerBench.
GeneratedAt time.Time `json:"generated_at"`
Hostname string `json:"hostname,omitempty"`
ServerModel string `json:"server_model,omitempty"`
BenchmarkProfile string `json:"benchmark_profile"`
// SelectedGPUIndices is a copy of the GPU indices chosen for the run.
SelectedGPUIndices []int `json:"selected_gpu_indices"`
// RecommendedSlotOrder lists GPU indices in the same order as GPUs.
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
// RampSteps holds one entry per ramp step, in execution order.
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
// OverallStatus starts as "OK" and is downgraded to "PARTIAL" by
// RunNvidiaPowerBench when problems are detected.
OverallStatus string `json:"overall_status"`
// Findings are human-readable summary lines shown in the report.
Findings []string `json:"findings,omitempty"`
// GPUs holds the per-GPU results, sorted by RunNvidiaPowerBench with the
// highest observed power first.
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}
// NvidiaPowerBenchGPU is the per-GPU result of the first calibration pass of
// a power benchmark run.
type NvidiaPowerBenchGPU struct {
Index int `json:"index"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`
// MaxObservedPowerW is filled from the calibration summary's P95 power
// value by RunNvidiaPowerBench (not an absolute maximum).
MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
// MaxObservedTempC is filled from the calibration summary's P95
// temperature value by RunNvidiaPowerBench.
MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"`
CalibrationAttempts int `json:"calibration_attempts,omitempty"`
Derated bool `json:"derated,omitempty"`
// Status is "OK", "PARTIAL" (calibration completed but derated) or
// "FAILED" (calibration did not complete).
Status string `json:"status"`
// OccupiedSlots lists the other selected GPU indices, i.e. every selected
// index except this GPU's own.
OccupiedSlots []int `json:"occupied_slots,omitempty"`
// OccupiedSlotsNote is a human-readable caveat derived from OccupiedSlots;
// empty when no other GPU was selected.
OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`
// Notes is a copy of the notes produced by the calibration run.
Notes []string `json:"notes,omitempty"`
}
// NvidiaPowerBenchStep is the result of one ramp step, in which calibration
// is re-run on a prefix of the recommended slot order.
type NvidiaPowerBenchStep struct {
// StepIndex is 1-based; step N covers the first N GPUs of the recommended
// slot order.
StepIndex int `json:"step_index"`
GPUIndices []int `json:"gpu_indices"`
// TotalObservedPowerW is the sum of the per-GPU P95 power values measured
// in this step.
TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
// AvgObservedPowerW is TotalObservedPowerW divided by the number of GPUs
// in the step.
AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"`
// MinPowerRealizationPct and AvgPowerRealizationPct are computed over the
// GPUs that completed the step, each as step power divided by that GPU's
// first-pass observed power, times 100. Both stay zero when no GPU
// qualified.
MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`
// DeratedGPUCount is the number of GPUs reported as derated in this step.
DeratedGPUCount int `json:"derated_gpu_count,omitempty"`
// Status is "OK", "PARTIAL" or "FAILED".
Status string `json:"status"`
Notes []string `json:"notes,omitempty"`
}