Split bee-bench into perf and power workflows

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:33:13 +03:00
parent 54338dbae5
commit 95124d228f
17 changed files with 718 additions and 259 deletions

View File

@@ -32,7 +32,8 @@ const (
var taskNames = map[string]string{
"nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark",
"nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = 300
}
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark":
case "nvidia-bench-perf":
if a == nil {
err = fmt.Errorf("app not configured")
break
@@ -644,6 +645,31 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
case platform.NvidiaBenchmarkProfileStability:
dur = 300
case platform.NvidiaBenchmarkProfileOvernight:
dur = 600
default:
dur = 120
}
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
dur = rampPlan.DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")