Add slot-aware ramp sequence to bee-bench power

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:47:40 +03:00
parent 95124d228f
commit 303de2df04
6 changed files with 375 additions and 23 deletions

View File

@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
b.WriteString(benchmarkCard)
}
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
b.WriteString(powerCard)
}
if len(report.Charts) > 0 {
for _, chart := range report.Charts {
@@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
)
}
func renderTaskPowerResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-bench-power" {
return ""
}
resultPath := taskBenchmarkResultPath(logText)
if strings.TrimSpace(resultPath) == "" {
return ""
}
raw, err := os.ReadFile(resultPath)
if err != nil {
return ""
}
var result platform.NvidiaPowerBenchResult
if err := json.Unmarshal(raw, &result); err != nil {
return ""
}
var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
}
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
}
b.WriteString(`</table></div></div>`)
return b.String()
}
func taskBenchmarkResultPath(logText string) string {
archivePath := taskArchivePathFromLog(logText)
if archivePath == "" {
return ""
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
if runDir == archivePath {
return ""
}
return filepath.Join(runDir, "result.json")
}

View File

@@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
case platform.NvidiaBenchmarkProfileStability:
dur = 300
case platform.NvidiaBenchmarkProfileOvernight:
dur = 600
default:
dur = 120
}
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
dur = rampPlan.DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
Profile: t.params.BenchmarkProfile,
GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
RampStep: t.params.RampStep,
RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID,
}, j.append)
case "nvidia-compute":
if a == nil {
err = fmt.Errorf("app not configured")