Benchmark: parallel GPU mode, resilient inventory query, server model in results

- Add parallel GPU mode (checkbox, off by default): runs all selected GPUs
  simultaneously via a single bee-gpu-burn invocation instead of one
  sequential invocation per GPU; per-GPU telemetry, throttle counters, TOPS,
  and scoring are preserved (see the dispatch sketch below)
- Make queryBenchmarkGPUInfo resilient: fall back to a base field set when
  the extended fields (attribute.multiprocessor_count, power.default_limit)
  cause exit status 2, so lgc (locked graphics clock) normalization is no
  longer silently skipped (see the fallback sketch below)
- Log an explicit "graphics clock lock skipped" note when inventory is
  unavailable
- Collect the server model from DMI (/sys/class/dmi/id/product_name) and
  store it in the result JSON; benchmark history columns now show
  "Server Model (N× GPU Model)" grouped by server + GPU type rather than by
  individual GPU index (see the DMI sketch below)
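
The parallel/sequential split from the first bullet, as a minimal Go sketch.
The bee-gpu-burn binary is real, but the --gpus flag and the launchBurn
helper are illustrative assumptions; the actual runner wiring lives in the
task queue and may pass indices differently.

    import (
        "context"
        "fmt"
        "os/exec"
        "strconv"
        "strings"
    )

    // launchBurn runs bee-gpu-burn once for all selected GPUs (parallel mode)
    // or once per GPU (the previous sequential behaviour). "--gpus" is a
    // hypothetical stand-in for however the real CLI accepts device indices.
    func launchBurn(ctx context.Context, indices []int, parallel bool) error {
        if parallel {
            list := make([]string, len(indices))
            for i, idx := range indices {
                list[i] = strconv.Itoa(idx)
            }
            // One invocation drives every device concurrently; telemetry,
            // throttle counters, and TOPS are still reported per GPU.
            return exec.CommandContext(ctx, "bee-gpu-burn",
                "--gpus", strings.Join(list, ",")).Run()
        }
        for _, idx := range indices {
            if err := exec.CommandContext(ctx, "bee-gpu-burn",
                "--gpus", strconv.Itoa(idx)).Run(); err != nil {
                return fmt.Errorf("gpu %d: %w", idx, err)
            }
        }
        return nil
    }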
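
The inventory fallback from the second bullet, sketched against nvidia-smi's
--query-gpu interface. The two extended fields are the ones named in this
commit; the base field set shown here is an illustrative assumption.

    import (
        "context"
        "errors"
        "os/exec"
    )

    // queryGPUInfo asks for the extended field set first; drivers that do not
    // know a field fail the whole query with exit status 2, so retry with the
    // base set instead of silently skipping clock-lock normalization.
    func queryGPUInfo(ctx context.Context) ([]byte, error) {
        const base = "index,name,clocks.max.graphics" // assumed base set
        const extended = base + ",attribute.multiprocessor_count,power.default_limit"
        query := func(fields string) ([]byte, error) {
            return exec.CommandContext(ctx, "nvidia-smi",
                "--query-gpu="+fields, "--format=csv,noheader,nounits").Output()
        }
        out, err := query(extended)
        var ee *exec.ExitError
        if err != nil && errors.As(err, &ee) && ee.ExitCode() == 2 {
            return query(base)
        }
        return out, err
    }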
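
The DMI lookup from the last bullet is a plain sysfs read; this sketch assumes
an empty string is the "unknown" value, matching the label logic that drops
the server part when it is missing.

    import (
        "os"
        "strings"
    )

    // serverModel reads the machine's product name from DMI. The kernel
    // exposes this file on most x86 servers; on failure the caller falls
    // back to a GPU-only column label.
    func serverModel() string {
        b, err := os.ReadFile("/sys/class/dmi/id/product_name")
        if err != nil {
            return ""
        }
        return strings.TrimSpace(string(b))
    }

For example, a box whose product_name reads "PowerEdge XE9680" with eight
H100s in one run would get the history column "PowerEdge XE9680 (8× NVIDIA
H100)".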

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Author: Mikhail Chusavitin
Date: 2026-04-07 18:32:15 +03:00
parent 1358485f2b
commit 93cfa78e8c
5 changed files with 389 additions and 71 deletions

View File

@@ -470,6 +470,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		GPUIndices        []int  `json:"gpu_indices"`
 		ExcludeGPUIndices []int  `json:"exclude_gpu_indices"`
 		RunNCCL           *bool  `json:"run_nccl"`
+		ParallelGPUs      *bool  `json:"parallel_gpus"`
 		DisplayName       string `json:"display_name"`
 	}
 	if r.Body != nil {
@@ -483,6 +484,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 	if body.RunNCCL != nil {
 		runNCCL = *body.RunNCCL
 	}
+	parallelGPUs := false
+	if body.ParallelGPUs != nil {
+		parallelGPUs = *body.ParallelGPUs
+	}
 	name := taskDisplayName("nvidia-benchmark", "", "")
 	if strings.TrimSpace(body.DisplayName) != "" {
 		name = body.DisplayName
@@ -493,6 +498,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
 		SizeMB:           body.SizeMB,
 		BenchmarkProfile: body.Profile,
 		RunNCCL:          runNCCL,
+		ParallelGPUs:     parallelGPUs,
 		DisplayName:      body.DisplayName,
 	}, name, h.opts.App, "benchmark-nvidia")
 	if err != nil {

View File

@@ -1625,6 +1625,10 @@ func renderBenchmark(opts HandlerOptions) string {
 			<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
 		</div>
 	</div>
+	<label class="benchmark-cb-row">
+		<input type="checkbox" id="benchmark-parallel-gpus">
+		<span>Run all selected GPUs simultaneously (parallel mode)</span>
+	</label>
 	<label class="benchmark-cb-row">
 		<input type="checkbox" id="benchmark-run-nccl" checked>
 		<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
@@ -1750,10 +1754,12 @@ function runNvidiaBenchmark() {
 		return;
 	}
 	if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
+	const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
 	const body = {
 		profile: document.getElementById('benchmark-profile').value || 'standard',
 		gpu_indices: selected,
 		run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
+		parallel_gpus: parallelGPUs,
 		display_name: 'NVIDIA Benchmark'
 	};
 	document.getElementById('benchmark-output').style.display = 'block';
@@ -1887,19 +1893,31 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 			displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
 			cells:       make(map[string]benchmarkHistoryCell),
 		}
+		// Count how many GPUs of each model appear in this run (for the label).
+		gpuModelCount := make(map[string]int)
 		for _, gpu := range result.GPUs {
-			key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
+			gpuModelCount[strings.TrimSpace(gpu.Name)]++
+		}
+		// Track best composite score per column key within this run.
+		runBest := make(map[string]float64)
+		for _, gpu := range result.GPUs {
+			key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
+			count := gpuModelCount[strings.TrimSpace(gpu.Name)]
 			columnByKey[key] = benchmarkHistoryColumn{
 				key:   key,
-				label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
+				label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
 				name:  strings.TrimSpace(gpu.Name),
 				index: gpu.Index,
 			}
-			run.cells[key] = benchmarkHistoryCell{
-				score:   gpu.Scores.CompositeScore,
-				present: true,
+			if gpu.Scores.CompositeScore > runBest[key] {
+				runBest[key] = gpu.Scores.CompositeScore
 			}
 		}
+		for key, score := range runBest {
+			run.cells[key] = benchmarkHistoryCell{score: score, present: true}
+		}
 		runs = append(runs, run)
 	}
@@ -1908,13 +1926,10 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 		columns = append(columns, col)
 	}
 	sort.Slice(columns, func(i, j int) bool {
-		leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
-		rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
-		if leftName != rightName {
-			return leftName < rightName
-		}
-		if columns[i].index != columns[j].index {
-			return columns[i].index < columns[j].index
+		li := strings.ToLower(columns[i].label)
+		lj := strings.ToLower(columns[j].label)
+		if li != lj {
+			return li < lj
 		}
 		return columns[i].key < columns[j].key
 	})
@@ -1924,16 +1939,25 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
 	return columns, runs
 }
-func benchmarkHistoryColumnKey(name string, index int) string {
-	return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
+// benchmarkHistoryColumnKey groups results by server model + GPU model so that
+// runs on the same hardware produce one column regardless of individual GPU index.
+func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
+	return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
 }
-func benchmarkHistoryColumnLabel(name string, index int) string {
-	name = strings.TrimSpace(name)
-	if name == "" {
-		return fmt.Sprintf("GPU %d", index)
+// benchmarkHistoryColumnLabel formats the column header as
+// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
+func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
+	serverModel = strings.TrimSpace(serverModel)
+	gpuName = strings.TrimSpace(gpuName)
+	if gpuName == "" {
+		gpuName = "Unknown GPU"
 	}
-	return fmt.Sprintf("%s / GPU %d", name, index)
+	gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
+	if serverModel == "" {
+		return gpuPart
+	}
+	return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
 }
 // ── Burn ──────────────────────────────────────────────────────────────────────

View File

@@ -123,6 +123,7 @@ type taskParams struct {
 	BurnProfile        string   `json:"burn_profile,omitempty"`
 	BenchmarkProfile   string   `json:"benchmark_profile,omitempty"`
 	RunNCCL            bool     `json:"run_nccl,omitempty"`
+	ParallelGPUs       bool     `json:"parallel_gpus,omitempty"`
 	DisplayName        string   `json:"display_name,omitempty"`
 	Device             string   `json:"device,omitempty"` // for install
 	PlatformComponents []string `json:"platform_components,omitempty"`
@@ -585,6 +586,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			GPUIndices:        t.params.GPUIndices,
 			ExcludeGPUIndices: t.params.ExcludeGPUIndices,
 			RunNCCL:           t.params.RunNCCL,
+			ParallelGPUs:      t.params.ParallelGPUs,
 		}, j.append)
 	case "nvidia-compute":
 		if a == nil {