Add slot-aware ramp sequence to bee-bench power

2026-04-14 17:47:40 +03:00
parent 95124d228f
commit 303de2df04
6 changed files with 375 additions and 23 deletions
@@ -124,6 +124,7 @@ type satRunner interface {
 	RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
 	RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
 	RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
 	RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
@@ -574,6 +575,13 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
 	return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
 }
 // RunNvidiaPowerBenchCtx forwards a power-bench request to the configured SAT
 // runner. A baseDir that is empty or whitespace-only is replaced with
 // DefaultBeeBenchPowerDir; the runner's result is returned unchanged.
 func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	dir := baseDir
 	if len(strings.TrimSpace(dir)) == 0 {
 		dir = DefaultBeeBenchPowerDir
 	}
 	return a.sat.RunNvidiaPowerBench(ctx, dir, opts, logFunc)
 }
 func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = DefaultSATBaseDir
@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
 type fakeSAT struct {
 	runNvidiaFn               func(string) (string, error)
 	runNvidiaBenchmarkFn      func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaPowerBenchFn     func(string, platform.NvidiaBenchmarkOptions) (string, error)
 	runNvidiaStressFn         func(string, platform.NvidiaStressOptions) (string, error)
 	runNvidiaComputeFn        func(string, int, []int) (string, error)
 	runNvidiaPowerFn          func(string, int, []int) (string, error)
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
 	return f.runNvidiaFn(baseDir)
 }
 // RunNvidiaPowerBench is the test double for the power bench. It calls the
 // dedicated runNvidiaPowerBenchFn hook when one is set and otherwise falls
 // back to the generic runNvidiaFn hook. The context and log callback are
 // ignored.
 func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
 	hook := f.runNvidiaPowerBenchFn
 	if hook == nil {
 		return f.runNvidiaFn(baseDir)
 	}
 	return hook(baseDir, opts)
 }
 func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
 	if f.runNvidiaTargetedStressFn != nil {
 		return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
 	}
 	return results, restore
 }
 // powerBenchDurationSec maps a benchmark profile name to a duration in
 // seconds. The name is lower-cased and trimmed before comparison: the
 // stability profile yields 300, the overnight profile 600, and anything else
 // (including an empty name) 120.
 func powerBenchDurationSec(profile string) int {
 	normalized := strings.TrimSpace(strings.ToLower(profile))
 	if normalized == NvidiaBenchmarkProfileStability {
 		return 300
 	}
 	if normalized == NvidiaBenchmarkProfileOvernight {
 		return 600
 	}
 	return 120
 }
 // occupiedSlots returns every entry of indices that is not equal to current,
 // in the original order. The result is always a freshly allocated, non-nil
 // slice, so it never aliases the input.
 func occupiedSlots(indices []int, current int) []int {
 	others := make([]int, 0, len(indices))
 	for i := 0; i < len(indices); i++ {
 		if indices[i] == current {
 			continue
 		}
 		others = append(others, indices[i])
 	}
 	return others
 }
 // cloneBenchmarkGPUInfoMap returns a new map holding the same key/value pairs
 // as src. Values are copied by assignment, and the result is non-nil even
 // when src is nil or empty.
 func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
 	dup := make(map[int]benchmarkGPUInfo, len(src))
 	for idx := range src {
 		dup[idx] = src[idx]
 	}
 	return dup
 }
 // renderPowerBenchReport renders result as a Markdown report.
 //
 // The report contains, in order: a header (version, profile, generation time,
 // overall status), the findings as a "Summary" list, the recommended slot
 // order, a ramp-sequence table followed by any per-step notes, a per-slot
 // results table, and one section per GPU listing its occupied-slots note and
 // calibration notes. Sections whose source slice is empty are omitted, except
 // for the per-slot table header, which is always written.
 func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	var b strings.Builder
 	b.WriteString("# Bee Bench Power Report\n\n")
 	fmt.Fprintf(&b, "**Benchmark version:** %s  \n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "**Profile:** %s  \n", result.BenchmarkProfile)
 	// The layout ends in a literal "UTC" (it is not a zone verb), so convert
 	// explicitly; otherwise a non-UTC GeneratedAt would be mislabelled.
 	fmt.Fprintf(&b, "**Generated:** %s  \n", result.GeneratedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
 	fmt.Fprintf(&b, "**Overall status:** %s  \n\n", result.OverallStatus)
 	if len(result.Findings) > 0 {
 		b.WriteString("## Summary\n\n")
 		for _, finding := range result.Findings {
 			fmt.Fprintf(&b, "- %s\n", finding)
 		}
 		b.WriteString("\n")
 	}
 	if len(result.RecommendedSlotOrder) > 0 {
 		b.WriteString("## Recommended Slot Order\n\n")
 		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
 	}
 	if len(result.RampSteps) > 0 {
 		b.WriteString("## Ramp Sequence\n\n")
 		b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
 		b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
 		for _, step := range result.RampSteps {
 			fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
 				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
 		}
 		b.WriteString("\n")
 		// Per-step notes (incomplete GPUs, low power realization) would
 		// otherwise only be visible in result.json.
 		wroteStepNote := false
 		for _, step := range result.RampSteps {
 			for _, note := range step.Notes {
 				fmt.Fprintf(&b, "- Step %d: %s\n", step.StepIndex, note)
 				wroteStepNote = true
 			}
 		}
 		if wroteStepNote {
 			b.WriteString("\n")
 		}
 	}
 	b.WriteString("## Per-Slot Results\n\n")
 	b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
 	b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
 			gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
 	}
 	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
 		if gpu.OccupiedSlotsNote != "" {
 			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
 		}
 		for _, note := range gpu.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
 		b.WriteString("\n")
 	}
 	return b.String()
 }
 // renderPowerBenchSummary renders result as newline-separated key=value
 // lines: run timestamp, benchmark version and profile, overall status, GPU
 // count, the recommended slot order (when non-empty), and for every ramp step
 // its GPU list and total observed power.
 func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
 	var b strings.Builder
 	// The key promises UTC, so normalize rather than trusting the location
 	// carried by GeneratedAt.
 	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
 	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
 	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
 	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
 	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
 	if len(result.RecommendedSlotOrder) > 0 {
 		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
 	}
 	for _, step := range result.RampSteps {
 		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
 		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
 	}
 	return b.String()
 }
 // RunNvidiaPowerBench runs the slot-aware power bench and writes its
 // artifacts into a fresh "power-<UTC timestamp>" directory under baseDir.
 //
 // The run has two phases:
 //  1. A baseline calibration over all selected GPUs, from which each GPU's
 //     status and the recommended slot order (highest observed power first)
 //     are derived.
 //  2. A ramp sequence: step N re-runs calibration on the first N GPUs of the
 //     recommended order and compares each GPU's observed power with its
 //     baseline ("power realization").
 //
 // A nil ctx is replaced with context.Background, a nil logFunc with a no-op,
 // and a blank baseDir with "/var/log/bee-bench/power". On success the run
 // directory (containing result.json, report.md and summary.txt) is returned.
 // An error is returned when GPU selection or GPU info lookup fails, when no
 // GPU is selected, or when the run directory or an output file cannot be
 // written. Calibration problems are reported through the result's statuses,
 // findings and notes rather than as an error.
 func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if ctx == nil {
 		ctx = context.Background()
 	}
 	if logFunc == nil {
 		logFunc = func(string) {}
 	}
 	if strings.TrimSpace(baseDir) == "" {
 		baseDir = "/var/log/bee-bench/power"
 	}
 	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
 	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
 	if err != nil {
 		return "", err
 	}
 	if len(selected) == 0 {
 		return "", fmt.Errorf("no NVIDIA GPUs selected")
 	}
 	ts := time.Now().UTC().Format("20060102-150405")
 	runDir := filepath.Join(baseDir, "power-"+ts)
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
 	}
 	verboseLog := filepath.Join(runDir, "verbose.log")
 	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
 	if infoErr != nil {
 		return "", infoErr
 	}
 	hostname, _ := os.Hostname() // best effort: the field is omitted from JSON when empty
 	result := NvidiaPowerBenchResult{
 		BenchmarkVersion:   benchmarkVersion,
 		GeneratedAt:        time.Now().UTC(),
 		Hostname:           hostname,
 		ServerModel:        readServerModel(),
 		BenchmarkProfile:   opts.Profile,
 		SelectedGPUIndices: append([]int(nil), selected...),
 		OverallStatus:      "OK",
 	}
 	// The profile duration is computed but not consumed yet: the calibration
 	// call below does not take a duration argument.
 	durationSec := powerBenchDurationSec(opts.Profile)
 	_ = durationSec

 	// Phase 1: baseline calibration. Its restore actions are undone in
 	// reverse order when this function returns.
 	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
 	defer func() {
 		for i := len(restoreActions) - 1; i >= 0; i-- {
 			restoreActions[i].fn()
 		}
 	}()
 	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
 	for _, idx := range selected {
 		info := infoByIndex[idx]
 		calib := calibByIndex[idx]
 		status := "OK"
 		if !calib.Completed {
 			status = "FAILED"
 			result.OverallStatus = "PARTIAL"
 		} else if calib.Derated {
 			status = "PARTIAL"
 			if result.OverallStatus == "OK" {
 				result.OverallStatus = "PARTIAL"
 			}
 		}
 		occupied := occupiedSlots(selected, idx)
 		note := ""
 		if len(occupied) > 0 {
 			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
 		}
 		gpus = append(gpus, NvidiaPowerBenchGPU{
 			Index:               idx,
 			Name:                info.Name,
 			BusID:               info.BusID,
 			DefaultPowerLimitW:  info.DefaultPowerLimitW,
 			AppliedPowerLimitW:  calib.AppliedPowerLimitW,
 			MaxObservedPowerW:   calib.Summary.P95PowerW,
 			MaxObservedTempC:    calib.Summary.P95TempC,
 			CalibrationAttempts: calib.Attempts,
 			Derated:             calib.Derated,
 			Status:              status,
 			OccupiedSlots:       occupied,
 			OccupiedSlotsNote:   note,
 			Notes:               append([]string(nil), calib.Notes...),
 		})
 	}
 	// Rank GPUs: highest observed power first, then highest applied limit,
 	// then non-derated before derated, then lowest index for a stable order.
 	sort.Slice(gpus, func(i, j int) bool {
 		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
 			return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
 		}
 		if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
 			return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
 		}
 		if gpus[i].Derated != gpus[j].Derated {
 			return !gpus[i].Derated
 		}
 		return gpus[i].Index < gpus[j].Index
 	})
 	result.GPUs = gpus
 	result.RecommendedSlotOrder = make([]int, 0, len(gpus))
 	for _, gpu := range gpus {
 		result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
 	}
 	if len(result.RecommendedSlotOrder) > 0 {
 		result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
 	}
 	for _, gpu := range gpus {
 		if gpu.Derated {
 			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
 		}
 	}
 	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
 	for _, gpu := range gpus {
 		singleByIndex[gpu.Index] = gpu
 	}

 	// Phase 2: ramp sequence over growing prefixes of the recommended order.
 	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
 		// Do not start further calibration runs once the caller has
 		// cancelled; the steps recorded so far are still written out.
 		if ctxErr := ctx.Err(); ctxErr != nil {
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp sequence stopped before step %d: %v.", step, ctxErr))
 			if result.OverallStatus == "OK" {
 				result.OverallStatus = "PARTIAL"
 			}
 			break
 		}
 		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
 		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
 		ramp := NvidiaPowerBenchStep{
 			StepIndex:  step,
 			GPUIndices: subset,
 			Status:     "OK",
 		}
 		// Still best effort, but surface the failure instead of dropping it.
 		if mkErr := os.MkdirAll(stepDir, 0755); mkErr != nil {
 			msg := fmt.Sprintf("mkdir %s: %v", stepDir, mkErr)
 			logFunc(msg)
 			ramp.Notes = append(ramp.Notes, msg)
 		}
 		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
 		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
 		// Undo this step's changes immediately so the next step starts clean.
 		for i := len(stepRestore) - 1; i >= 0; i-- {
 			stepRestore[i].fn()
 		}
 		failedGPUCount := 0
 		var realizationValues []float64
 		for _, idx := range subset {
 			calib := stepCalib[idx]
 			ramp.TotalObservedPowerW += calib.Summary.P95PowerW
 			if calib.Derated {
 				ramp.DeratedGPUCount++
 			}
 			if !calib.Completed {
 				failedGPUCount++
 				ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
 				continue
 			}
 			if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
 				realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
 				realizationValues = append(realizationValues, realization)
 			}
 		}
 		// Decide the status once, after all GPUs are inspected, so a derated
 		// GPU later in the subset cannot downgrade FAILED to PARTIAL.
 		switch {
 		case failedGPUCount > 0:
 			ramp.Status = "FAILED"
 		case ramp.DeratedGPUCount > 0:
 			ramp.Status = "PARTIAL"
 		}
 		if len(subset) > 0 {
 			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
 		}
 		if len(realizationValues) > 0 {
 			ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
 			ramp.MinPowerRealizationPct = realizationValues[0]
 			for _, v := range realizationValues[1:] {
 				if v < ramp.MinPowerRealizationPct {
 					ramp.MinPowerRealizationPct = v
 				}
 			}
 		}
 		if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
 			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
 			if result.OverallStatus == "OK" {
 				result.OverallStatus = "PARTIAL"
 			}
 		}
 		if ramp.DeratedGPUCount > 0 {
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
 		}
 		// A step with incomplete GPUs must not leave the run reported as OK.
 		if failedGPUCount > 0 {
 			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) had %d GPU(s) that did not complete targeted_power.", step, joinIndexList(subset), failedGPUCount))
 			if result.OverallStatus == "OK" {
 				result.OverallStatus = "PARTIAL"
 			}
 		}
 		result.RampSteps = append(result.RampSteps, ramp)
 	}

 	resultJSON, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return "", fmt.Errorf("marshal power result: %w", err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
 		return "", fmt.Errorf("write result.json: %w", err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
 		return "", fmt.Errorf("write report.md: %w", err)
 	}
 	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
 		return "", fmt.Errorf("write summary.txt: %w", err)
 	}
 	return runDir, nil
 }
@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
 	MaxBusBWGBps       float64  `json:"max_busbw_gbps,omitempty"`
 	Notes              []string `json:"notes,omitempty"`
 }
 // NvidiaPowerBenchResult is the top-level record of one power-bench run. It is
 // serialized to result.json by RunNvidiaPowerBench and is the input to the
 // Markdown report and key=value summary renderers.
 type NvidiaPowerBenchResult struct {
 	BenchmarkVersion     string                 `json:"benchmark_version"`
 	// GeneratedAt is set from time.Now().UTC() by RunNvidiaPowerBench.
 	GeneratedAt          time.Time              `json:"generated_at"`
 	Hostname             string                 `json:"hostname,omitempty"`
 	ServerModel          string                 `json:"server_model,omitempty"`
 	// BenchmarkProfile is the profile name taken from the run options.
 	BenchmarkProfile     string                 `json:"benchmark_profile"`
 	// SelectedGPUIndices is a copy of the resolved GPU selection.
 	SelectedGPUIndices   []int                  `json:"selected_gpu_indices"`
 	// RecommendedSlotOrder lists GPU indices in the same order as GPUs.
 	RecommendedSlotOrder []int                  `json:"recommended_slot_order,omitempty"`
 	// RampSteps holds one entry per ramp step, appended in step order.
 	RampSteps            []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
 	// OverallStatus starts as "OK" and is downgraded to "PARTIAL" by
 	// RunNvidiaPowerBench when a problem is detected.
 	OverallStatus        string                 `json:"overall_status"`
 	// Findings are human-readable summary lines shown in the report.
 	Findings             []string               `json:"findings,omitempty"`
 	// GPUs holds the per-GPU results, sorted by RunNvidiaPowerBench with the
 	// highest observed power first.
 	GPUs                 []NvidiaPowerBenchGPU  `json:"gpus"`
 }
 // NvidiaPowerBenchGPU is the per-GPU result of the baseline calibration phase
 // of a power-bench run.
 type NvidiaPowerBenchGPU struct {
 	Index               int      `json:"index"`
 	Name                string   `json:"name,omitempty"`
 	BusID               string   `json:"bus_id,omitempty"`
 	DefaultPowerLimitW  float64  `json:"default_power_limit_w,omitempty"`
 	// AppliedPowerLimitW is the power limit reported by calibration.
 	AppliedPowerLimitW  float64  `json:"applied_power_limit_w,omitempty"`
 	// MaxObservedPowerW is filled from the calibration summary's P95PowerW
 	// by RunNvidiaPowerBench, i.e. a 95th-percentile value despite the name.
 	MaxObservedPowerW   float64  `json:"max_observed_power_w,omitempty"`
 	// MaxObservedTempC is filled from the calibration summary's P95TempC.
 	MaxObservedTempC    float64  `json:"max_observed_temp_c,omitempty"`
 	CalibrationAttempts int      `json:"calibration_attempts,omitempty"`
 	Derated             bool     `json:"derated,omitempty"`
 	// Status is "OK", "PARTIAL" (calibration completed but derated) or
 	// "FAILED" (calibration did not complete).
 	Status              string   `json:"status"`
 	// OccupiedSlots lists the other selected GPU indices, excluding Index.
 	OccupiedSlots       []int    `json:"occupied_slots,omitempty"`
 	// OccupiedSlotsNote is a report sentence built from OccupiedSlots; it is
 	// empty when no other GPU was selected.
 	OccupiedSlotsNote   string   `json:"occupied_slots_note,omitempty"`
 	// Notes is a copy of the notes produced by calibration for this GPU.
 	Notes               []string `json:"notes,omitempty"`
 }
 // NvidiaPowerBenchStep describes one step of the ramp sequence, in which
 // calibration is re-run on a growing prefix of the recommended slot order.
 type NvidiaPowerBenchStep struct {
 	// StepIndex is 1-based; step N covers the first N recommended GPUs.
 	StepIndex              int      `json:"step_index"`
 	GPUIndices             []int    `json:"gpu_indices"`
 	// TotalObservedPowerW sums the calibration P95PowerW of the step's GPUs.
 	TotalObservedPowerW    float64  `json:"total_observed_power_w,omitempty"`
 	// AvgObservedPowerW is TotalObservedPowerW divided by the GPU count.
 	AvgObservedPowerW      float64  `json:"avg_observed_power_w,omitempty"`
 	// MinPowerRealizationPct and AvgPowerRealizationPct are the minimum and
 	// mean, over GPUs that completed the step, of step power as a percentage
 	// of the same GPU's baseline MaxObservedPowerW.
 	MinPowerRealizationPct float64  `json:"min_power_realization_pct,omitempty"`
 	AvgPowerRealizationPct float64  `json:"avg_power_realization_pct,omitempty"`
 	// DeratedGPUCount counts GPUs whose calibration in this step was derated.
 	DeratedGPUCount        int      `json:"derated_gpu_count,omitempty"`
 	// Status is one of "OK", "PARTIAL" or "FAILED".
 	Status                 string   `json:"status"`
 	Notes                  []string `json:"notes,omitempty"`
 }
@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
 	if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
 		b.WriteString(benchmarkCard)
 	}
 	if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
 		b.WriteString(powerCard)
 	}
 	if len(report.Charts) > 0 {
 		for _, chart := range report.Charts {
@@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
 	)
 }
 // renderTaskPowerResultsCard builds the "Power Results" HTML card for a
 // finished "nvidia-bench-power" task. It locates result.json via the archive
 // path found in logText, decodes it, and renders the recommended slot order
 // (when present) plus one table row per GPU. An empty string is returned for
 // any other target, when no result path can be derived, or when the file
 // cannot be read or decoded.
 func renderTaskPowerResultsCard(target, logText string) string {
 	const noCard = ""
 	if strings.TrimSpace(target) != "nvidia-bench-power" {
 		return noCard
 	}
 	jsonPath := taskBenchmarkResultPath(logText)
 	if len(strings.TrimSpace(jsonPath)) == 0 {
 		return noCard
 	}
 	payload, readErr := os.ReadFile(jsonPath)
 	if readErr != nil {
 		return noCard
 	}
 	var parsed platform.NvidiaPowerBenchResult
 	if decodeErr := json.Unmarshal(payload, &parsed); decodeErr != nil {
 		return noCard
 	}

 	var card strings.Builder
 	card.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
 	if order := parsed.RecommendedSlotOrder; len(order) > 0 {
 		card.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> `)
 		card.WriteString(html.EscapeString(joinTaskIndices(order)))
 		card.WriteString(`</p>`)
 	}
 	card.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
 	for i := range parsed.GPUs {
 		row := parsed.GPUs[i]
 		// Status is the only free-form string in the row, so it is escaped.
 		fmt.Fprintf(&card, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
 			row.Index, html.EscapeString(row.Status), row.MaxObservedPowerW, row.AppliedPowerLimitW)
 	}
 	card.WriteString(`</table></div></div>`)
 	return card.String()
 }
 // taskBenchmarkResultPath derives the location of a run's result.json from
 // the archive path found in logText: the ".tar.gz" suffix is stripped to get
 // the run directory and "result.json" is joined onto it. It returns "" when
 // no archive path is found or the path does not end in ".tar.gz".
 func taskBenchmarkResultPath(logText string) string {
 	const archiveExt = ".tar.gz"
 	archive := taskArchivePathFromLog(logText)
 	if !strings.HasSuffix(archive, archiveExt) {
 		// Also covers the empty path, which cannot carry the suffix.
 		return ""
 	}
 	return filepath.Join(strings.TrimSuffix(archive, archiveExt), "result.json")
 }
@@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			err = fmt.Errorf("app not configured")
 			break
 		}
-		dur := t.params.Duration
+		archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
-		if dur <= 0 {
+			Profile:           t.params.BenchmarkProfile,
-			switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
+			GPUIndices:        t.params.GPUIndices,
-			case platform.NvidiaBenchmarkProfileStability:
+			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
-				dur = 300
+			RampStep:          t.params.RampStep,
-			case platform.NvidiaBenchmarkProfileOvernight:
+			RampTotal:         t.params.RampTotal,
-				dur = 600
+			RampRunID:         t.params.RampRunID,
-			default:
+		}, j.append)
 				dur = 120
 			}
 		}
 		rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
 		if planErr != nil {
 			err = planErr
 			break
 		}
 		if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
 			dur = rampPlan.DurationSec
 		}
 		archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
 	case "nvidia-compute":
 		if a == nil {
 			err = fmt.Errorf("app not configured")