Add slot-aware ramp sequence to bee-bench power

2026-04-14 17:47:40 +03:00
parent 95124d228f
commit 303de2df04
6 changed files with 375 additions and 23 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
 	}
 	return results, restore
 }
+
+// powerBenchDurationSec maps a benchmark profile name to a duration in
+// seconds. The name is lowercased and stripped of surrounding whitespace
+// before it is compared against the known profile constants; any profile
+// other than stability or overnight yields the 120-second default.
+func powerBenchDurationSec(profile string) int {
+	normalized := strings.TrimSpace(strings.ToLower(profile))
+	if normalized == NvidiaBenchmarkProfileStability {
+		return 300
+	}
+	if normalized == NvidiaBenchmarkProfileOvernight {
+		return 600
+	}
+	return 120
+}
+
+// occupiedSlots returns a freshly allocated slice holding every entry of
+// indices that differs from current, preserving the input order. The input
+// slice is left untouched and the result is never nil.
+func occupiedSlots(indices []int, current int) []int {
+	others := make([]int, 0, len(indices))
+	for i := 0; i < len(indices); i++ {
+		if indices[i] == current {
+			continue
+		}
+		others = append(others, indices[i])
+	}
+	return others
+}
+
+// cloneBenchmarkGPUInfoMap returns a new map containing the same key/value
+// pairs as src. Values are copied by assignment, so the copy is shallow.
+// A nil src yields an empty, non-nil map.
+func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
+	dup := make(map[int]benchmarkGPUInfo, len(src))
+	for idx := range src {
+		dup[idx] = src[idx]
+	}
+	return dup
+}
+
+// renderPowerBenchReport renders result as a Markdown report.
+//
+// The report always contains the header fields and the per-slot table plus
+// one notes section per GPU. The "Summary", "Recommended Slot Order" and
+// "Ramp Sequence" sections are emitted only when the corresponding slices in
+// result are non-empty.
+func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
+	var b strings.Builder
+	b.WriteString("# Bee Bench Power Report\n\n")
+	fmt.Fprintf(&b, "**Benchmark version:** %s  \n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
+	// "UTC" in the layout is a literal suffix, not a zone verb, so the
+	// timestamp is normalized to UTC first to keep the label truthful.
+	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
+	fmt.Fprintf(&b, "**Overall status:** %s  \n\n", result.OverallStatus)
+	if len(result.Findings) > 0 {
+		b.WriteString("## Summary\n\n")
+		for _, finding := range result.Findings {
+			fmt.Fprintf(&b, "- %s\n", finding)
+		}
+		b.WriteString("\n")
+	}
+	if len(result.RecommendedSlotOrder) > 0 {
+		b.WriteString("## Recommended Slot Order\n\n")
+		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
+	}
+	if len(result.RampSteps) > 0 {
+		b.WriteString("## Ramp Sequence\n\n")
+		b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
+		b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
+		for _, step := range result.RampSteps {
+			fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
+				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
+		}
+		b.WriteString("\n")
+	}
+	// Per-slot table: one row per GPU, in the order stored in result.GPUs.
+	b.WriteString("## Per-Slot Results\n\n")
+	b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
+	b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
+			gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
+	}
+	b.WriteString("\n")
+	// Per-GPU notes: the occupied-slots caveat (if any) comes first,
+	// followed by the GPU's own notes.
+	for _, gpu := range result.GPUs {
+		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
+		if gpu.OccupiedSlotsNote != "" {
+			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
+		}
+		for _, note := range gpu.Notes {
+			fmt.Fprintf(&b, "- %s\n", note)
+		}
+		b.WriteString("\n")
+	}
+	return b.String()
+}
+
+// renderPowerBenchSummary renders result as newline-separated key=value
+// pairs. It always writes the run timestamp, benchmark version, profile,
+// overall status and GPU count; the recommended slot order is written only
+// when non-empty, and each ramp step contributes its GPU list and total
+// power.
+func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
+	var b strings.Builder
+	// The key promises UTC, so normalize instead of trusting the caller's
+	// time zone; RFC 3339 then renders the zone as "Z".
+	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
+	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
+	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
+	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
+	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
+	if len(result.RecommendedSlotOrder) > 0 {
+		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
+	}
+	for _, step := range result.RampSteps {
+		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
+		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
+	}
+	return b.String()
+}
+
+// RunNvidiaPowerBench runs the power benchmark for the selected NVIDIA GPUs
+// and writes its artifacts into a new "power-<timestamp>" directory under
+// baseDir.
+//
+// The run has two phases:
+//  1. A calibration pass over all selected GPUs, whose results are sorted
+//     into a recommended slot order (highest observed power first).
+//  2. A ramp sequence: step N re-runs calibration on the first N GPUs of
+//     that order and compares each GPU's observed power with its value from
+//     phase 1.
+//
+// A nil ctx is replaced with context.Background(), a nil logFunc with a
+// no-op, and a blank baseDir with "/var/log/bee-bench/power".
+//
+// On success it returns the run directory, which then contains result.json,
+// report.md and summary.txt. It returns an error when GPU selection fails or
+// is empty, when the run directory cannot be created, when GPU info cannot
+// be queried, or when a result file cannot be marshalled or written.
+// Individual GPU or ramp-step failures do not produce an error; they are
+// reflected in the per-item status fields and in OverallStatus.
+func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if logFunc == nil {
+		logFunc = func(string) {}
+	}
+	if strings.TrimSpace(baseDir) == "" {
+		baseDir = "/var/log/bee-bench/power"
+	}
+	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
+	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
+	if err != nil {
+		return "", err
+	}
+	if len(selected) == 0 {
+		return "", fmt.Errorf("no NVIDIA GPUs selected")
+	}
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "power-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
+	if infoErr != nil {
+		return "", infoErr
+	}
+	// The hostname is informational only; an empty value is omitted from
+	// the JSON output, so a lookup failure is deliberately ignored.
+	hostname, _ := os.Hostname()
+	result := NvidiaPowerBenchResult{
+		BenchmarkVersion:   benchmarkVersion,
+		GeneratedAt:        time.Now().UTC(),
+		Hostname:           hostname,
+		ServerModel:        readServerModel(),
+		BenchmarkProfile:   opts.Profile,
+		SelectedGPUIndices: append([]int(nil), selected...),
+		OverallStatus:      "OK",
+	}
+	// NOTE(review): the profile duration is computed but not passed to the
+	// calibration call below; kept as in the original pending wiring.
+	durationSec := powerBenchDurationSec(opts.Profile)
+	_ = durationSec
+	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
+	// Undo the calibration's restore actions in reverse order on return.
+	defer func() {
+		for i := len(restoreActions) - 1; i >= 0; i-- {
+			restoreActions[i].fn()
+		}
+	}()
+	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
+	for _, idx := range selected {
+		info := infoByIndex[idx]
+		calib := calibByIndex[idx]
+		status := "OK"
+		if !calib.Completed {
+			status = "FAILED"
+			result.OverallStatus = "PARTIAL"
+		} else if calib.Derated {
+			status = "PARTIAL"
+			if result.OverallStatus == "OK" {
+				result.OverallStatus = "PARTIAL"
+			}
+		}
+		occupied := occupiedSlots(selected, idx)
+		note := ""
+		if len(occupied) > 0 {
+			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
+		}
+		gpus = append(gpus, NvidiaPowerBenchGPU{
+			Index:               idx,
+			Name:                info.Name,
+			BusID:               info.BusID,
+			DefaultPowerLimitW:  info.DefaultPowerLimitW,
+			AppliedPowerLimitW:  calib.AppliedPowerLimitW,
+			MaxObservedPowerW:   calib.Summary.P95PowerW,
+			MaxObservedTempC:    calib.Summary.P95TempC,
+			CalibrationAttempts: calib.Attempts,
+			Derated:             calib.Derated,
+			Status:              status,
+			OccupiedSlots:       occupied,
+			OccupiedSlotsNote:   note,
+			Notes:               append([]string(nil), calib.Notes...),
+		})
+	}
+	// Rank GPUs: higher observed power first, then higher applied limit,
+	// then non-derated before derated, then lower index.
+	sort.Slice(gpus, func(i, j int) bool {
+		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
+			return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
+		}
+		if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
+			return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
+		}
+		if gpus[i].Derated != gpus[j].Derated {
+			return !gpus[i].Derated
+		}
+		return gpus[i].Index < gpus[j].Index
+	})
+	result.GPUs = gpus
+	result.RecommendedSlotOrder = make([]int, 0, len(gpus))
+	for _, gpu := range gpus {
+		result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
+	}
+	if len(result.RecommendedSlotOrder) > 0 {
+		result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
+	}
+	for _, gpu := range gpus {
+		if gpu.Derated {
+			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
+		}
+	}
+	// Baseline lookup used to express ramp-step power as a percentage of
+	// what each GPU reached in the first calibration pass.
+	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
+	for _, gpu := range gpus {
+		singleByIndex[gpu.Index] = gpu
+	}
+	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
+		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
+		ramp := NvidiaPowerBenchStep{
+			StepIndex:  step,
+			GPUIndices: subset,
+			Status:     "OK",
+		}
+		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
+		if err := os.MkdirAll(stepDir, 0755); err != nil {
+			// Best effort: the step is still attempted, but the failure
+			// is surfaced instead of being silently dropped.
+			logFunc(fmt.Sprintf("power bench: mkdir %s: %v", stepDir, err))
+			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Could not create step directory %s: %v", stepDir, err))
+		}
+		// Each step gets its own copy of the GPU info map so the step's
+		// calibration cannot alter the shared one.
+		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
+		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
+		// Run this step's restore actions immediately (not deferred) so
+		// they take effect before the next step starts.
+		for i := len(stepRestore) - 1; i >= 0; i-- {
+			stepRestore[i].fn()
+		}
+		var realizationValues []float64
+		for _, idx := range subset {
+			calib := stepCalib[idx]
+			ramp.TotalObservedPowerW += calib.Summary.P95PowerW
+			if calib.Derated {
+				ramp.DeratedGPUCount++
+				// Never downgrade a FAILED step back to PARTIAL.
+				if ramp.Status == "OK" {
+					ramp.Status = "PARTIAL"
+				}
+			}
+			if !calib.Completed {
+				ramp.Status = "FAILED"
+				ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
+				continue
+			}
+			if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
+				realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
+				realizationValues = append(realizationValues, realization)
+			}
+		}
+		if len(subset) > 0 {
+			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
+		}
+		if len(realizationValues) > 0 {
+			ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
+			ramp.MinPowerRealizationPct = realizationValues[0]
+			for _, v := range realizationValues[1:] {
+				if v < ramp.MinPowerRealizationPct {
+					ramp.MinPowerRealizationPct = v
+				}
+			}
+		}
+		if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
+			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
+			if result.OverallStatus == "OK" {
+				result.OverallStatus = "PARTIAL"
+			}
+		}
+		if ramp.DeratedGPUCount > 0 {
+			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
+		}
+		// A degraded or failed ramp step must not leave the run reported
+		// as fully OK.
+		if ramp.Status != "OK" && result.OverallStatus == "OK" {
+			result.OverallStatus = "PARTIAL"
+		}
+		result.RampSteps = append(result.RampSteps, ramp)
+	}
+	resultJSON, err := json.MarshalIndent(result, "", "  ")
+	if err != nil {
+		return "", fmt.Errorf("marshal power result: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
+		return "", fmt.Errorf("write result.json: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
+		return "", fmt.Errorf("write report.md: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
+		return "", fmt.Errorf("write summary.txt: %w", err)
+	}
+	return runDir, nil
+}
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
 	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
 	Notes              []string `json:"notes,omitempty"`
 }
+
+// NvidiaPowerBenchResult is the top-level outcome of a power bench run.
+// RunNvidiaPowerBench serializes it to result.json and renders it into
+// report.md and summary.txt.
+//
+// GPUs holds one entry per selected GPU, stored in the same order as
+// RecommendedSlotOrder. RampSteps holds one entry per ramp step. Findings
+// are human-readable summary lines shown in the report's "Summary" section.
+type NvidiaPowerBenchResult struct {
+	BenchmarkVersion     string                 `json:"benchmark_version"`
+	GeneratedAt          time.Time              `json:"generated_at"`
+	Hostname             string                 `json:"hostname,omitempty"`
+	ServerModel          string                 `json:"server_model,omitempty"`
+	BenchmarkProfile     string                 `json:"benchmark_profile"`
+	SelectedGPUIndices   []int                  `json:"selected_gpu_indices"`
+	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
+	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
+	OverallStatus        string                 `json:"overall_status"`
+	Findings             []string               `json:"findings,omitempty"`
+	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
+}
+
+// NvidiaPowerBenchGPU is the per-GPU result of the initial calibration pass
+// of a power bench run.
+//
+// As populated by RunNvidiaPowerBench, MaxObservedPowerW and
+// MaxObservedTempC carry the calibration summary's P95 power and P95
+// temperature. OccupiedSlots lists the other selected GPU indices, and
+// OccupiedSlotsNote is the matching caveat text (empty when no other GPU
+// was selected). Status is "OK", "PARTIAL" (derated) or "FAILED"
+// (calibration did not complete).
+type NvidiaPowerBenchGPU struct {
+	Index               int      `json:"index"`
+	Name                string   `json:"name,omitempty"`
+	BusID               string   `json:"bus_id,omitempty"`
+	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
+	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
+	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
+	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
+	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
+	Derated             bool     `json:"derated,omitempty"`
+	Status              string   `json:"status"`
+	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
+	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
+	Notes               []string `json:"notes,omitempty"`
+}
+
+// NvidiaPowerBenchStep describes one step of the ramp sequence.
+//
+// As populated by RunNvidiaPowerBench, StepIndex is 1-based and GPUIndices
+// holds the first StepIndex entries of the recommended slot order.
+// TotalObservedPowerW sums the step's per-GPU P95 power and
+// AvgObservedPowerW divides that by the number of GPUs in the step. The
+// realization percentages compare each completed GPU's power in this step
+// with its MaxObservedPowerW from the initial calibration pass; they stay
+// zero (and are omitted from JSON) when no GPU yielded a comparison.
+type NvidiaPowerBenchStep struct {
+	StepIndex              int      `json:"step_index"`
+	GPUIndices             []int    `json:"gpu_indices"`
+	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
+	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
+	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
+	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
+	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
+	Status                 string   `json:"status"`
+	Notes                  []string `json:"notes,omitempty"`
+}