Benchmark: parallel GPU mode, resilient inventory query, server model in results
- Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved
- Make queryBenchmarkGPUInfo resilient: falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, preventing lgc normalization from being silently skipped
- Log explicit "graphics clock lock skipped" note when inventory is unavailable
- Collect server model from DMI (/sys/class/dmi/id/product_name) and store in result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than individual GPU index

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -470,6 +470,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
RunNCCL *bool `json:"run_nccl"`
|
||||
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||
DisplayName string `json:"display_name"`
|
||||
}
|
||||
if r.Body != nil {
|
||||
@@ -483,6 +484,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
parallelGPUs := false
|
||||
if body.ParallelGPUs != nil {
|
||||
parallelGPUs = *body.ParallelGPUs
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
@@ -493,6 +498,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
ParallelGPUs: parallelGPUs,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
|
||||
@@ -1625,6 +1625,10 @@ func renderBenchmark(opts HandlerOptions) string {
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
</div>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="checkbox" id="benchmark-parallel-gpus">
|
||||
<span>Run all selected GPUs simultaneously (parallel mode)</span>
|
||||
</label>
|
||||
<label class="benchmark-cb-row">
|
||||
<input type="checkbox" id="benchmark-run-nccl" checked>
|
||||
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
||||
@@ -1750,10 +1754,12 @@ function runNvidiaBenchmark() {
|
||||
return;
|
||||
}
|
||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||
const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
|
||||
const body = {
|
||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||
gpu_indices: selected,
|
||||
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
||||
parallel_gpus: parallelGPUs,
|
||||
display_name: 'NVIDIA Benchmark'
|
||||
};
|
||||
document.getElementById('benchmark-output').style.display = 'block';
|
||||
@@ -1887,19 +1893,31 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||
cells: make(map[string]benchmarkHistoryCell),
|
||||
}
|
||||
|
||||
// Count how many GPUs of each model appear in this run (for the label).
|
||||
gpuModelCount := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
||||
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||
}
|
||||
|
||||
// Track best composite score per column key within this run.
|
||||
runBest := make(map[string]float64)
|
||||
for _, gpu := range result.GPUs {
|
||||
key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
|
||||
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
||||
label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
}
|
||||
run.cells[key] = benchmarkHistoryCell{
|
||||
score: gpu.Scores.CompositeScore,
|
||||
present: true,
|
||||
if gpu.Scores.CompositeScore > runBest[key] {
|
||||
runBest[key] = gpu.Scores.CompositeScore
|
||||
}
|
||||
}
|
||||
for key, score := range runBest {
|
||||
run.cells[key] = benchmarkHistoryCell{score: score, present: true}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
|
||||
@@ -1908,13 +1926,10 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
columns = append(columns, col)
|
||||
}
|
||||
sort.Slice(columns, func(i, j int) bool {
|
||||
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
||||
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
||||
if leftName != rightName {
|
||||
return leftName < rightName
|
||||
}
|
||||
if columns[i].index != columns[j].index {
|
||||
return columns[i].index < columns[j].index
|
||||
li := strings.ToLower(columns[i].label)
|
||||
lj := strings.ToLower(columns[j].label)
|
||||
if li != lj {
|
||||
return li < lj
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
})
|
||||
@@ -1924,16 +1939,25 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
return columns, runs
|
||||
}
|
||||
|
||||
// benchmarkHistoryColumnKey groups results by server model + GPU model so that
// runs on the same hardware produce one column regardless of individual GPU index.
// Both inputs are trimmed before joining, so keys are stable across results that
// differ only in surrounding whitespace.
func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
	return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
}
|
||||
|
||||
// benchmarkHistoryColumnLabel formats the column header as
// "Server Model (N× GPU Model)" or just the GPU part when server info is missing.
// An empty gpuName is rendered as "Unknown GPU" so the label never collapses to
// a bare count; count is taken verbatim from the caller's per-model tally.
func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
	serverModel = strings.TrimSpace(serverModel)
	gpuName = strings.TrimSpace(gpuName)
	if gpuName == "" {
		gpuName = "Unknown GPU"
	}
	// e.g. "8× NVIDIA H100"
	gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
	if serverModel == "" {
		return gpuPart
	}
	// e.g. "Dell PowerEdge R760 (8× NVIDIA H100)"
	return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -123,6 +123,7 @@ type taskParams struct {
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
@@ -585,6 +586,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
|
||||
Reference in New Issue
Block a user