From 93cfa78e8c1636551a463aaf3245320b70c33923 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Tue, 7 Apr 2026 18:32:15 +0300 Subject: [PATCH] Benchmark: parallel GPU mode, resilient inventory query, server model in results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved - Make queryBenchmarkGPUInfo resilient: falls back to a base field set when extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, preventing lgc normalization from being silently skipped - Log explicit "graphics clock lock skipped" note when inventory is unavailable - Collect server model from DMI (/sys/class/dmi/id/product_name) and store in result JSON; benchmark history columns now show "Server Model (N× GPU Model)" grouped by server+GPU type rather than individual GPU index Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 386 ++++++++++++++++++--- audit/internal/platform/benchmark_types.go | 4 + audit/internal/webui/api.go | 6 + audit/internal/webui/pages.go | 62 +++- audit/internal/webui/tasks.go | 2 + 5 files changed, 389 insertions(+), 71 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 814ec25..23f5a31 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -105,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv BenchmarkVersion: benchmarkVersion, GeneratedAt: time.Now().UTC(), Hostname: hostname, + ServerModel: readServerModel(), BenchmarkProfile: spec.Name, + ParallelGPUs: opts.ParallelGPUs, SelectedGPUIndices: append([]int(nil), selected...), Normalization: BenchmarkNormalization{ Status: "full", @@ -143,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } }() + if opts.ParallelGPUs { + runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples) + } else { + for _, idx := range selected { gpuResult := BenchmarkGPUResult{ Index: idx, @@ -285,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) } + } // end sequential path + if len(selected) > 1 && opts.RunNCCL { result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) if result.Interconnect != nil && result.Interconnect.Supported { @@ -362,60 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { } } -func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { - args := []string{ - "--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", - "--format=csv,noheader,nounits", - } - if len(gpuIndices) > 0 { - args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) - } - out, err := satExecCommand("nvidia-smi", args...).Output() - if err != nil { - return nil, fmt.Errorf("nvidia-smi gpu info: %w", err) - } - - r := csv.NewReader(strings.NewReader(string(out))) - r.TrimLeadingSpace = true - r.FieldsPerRecord = -1 - rows, err := r.ReadAll() - if err != nil { - return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err) - } - - infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) - for _, row := range rows { - if len(row) < 9 { - continue - } - idx, err := strconv.Atoi(strings.TrimSpace(row[0])) - if err != nil { - continue - } - info := benchmarkGPUInfo{ - Index: idx, - UUID: strings.TrimSpace(row[1]), - Name: strings.TrimSpace(row[2]), - BusID: strings.TrimSpace(row[3]), - VBIOS: strings.TrimSpace(row[4]), - PowerLimitW: parseBenchmarkFloat(row[5]), - MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), - MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), - } - if len(row) >= 9 { - info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) - } - if len(row) >= 10 { - info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) - } - if len(row) >= 11 { - info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) - } - infoByIndex[idx] = info - } - return infoByIndex, nil +// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try. +// Fields are tried in order; the first successful query wins. Extended fields +// (attribute.multiprocessor_count, power.default_limit) are not supported on +// all driver versions, so we fall back to the base set if the full query fails. +var benchmarkGPUInfoQueries = []struct { + fields string + extended bool // whether this query includes optional extended fields +}{ + { + fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", + extended: true, + }, + { + fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics", + extended: false, + }, } +func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { + var lastErr error + for _, q := range benchmarkGPUInfoQueries { + args := []string{ + "--query-gpu=" + q.fields, + "--format=csv,noheader,nounits", + } + if len(gpuIndices) > 0 { + args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) + } + out, err := satExecCommand("nvidia-smi", args...).Output() + if err != nil { + lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err) + continue + } + + r := csv.NewReader(strings.NewReader(string(out))) + r.TrimLeadingSpace = true + r.FieldsPerRecord = -1 + rows, err := r.ReadAll() + if err != nil { + lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err) + continue + } + + infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) + for _, row := range rows { + if len(row) < 9 { + continue + } + idx, err := strconv.Atoi(strings.TrimSpace(row[0])) + if err != nil { + continue + } + info := benchmarkGPUInfo{ + Index: idx, + UUID: strings.TrimSpace(row[1]), + Name: strings.TrimSpace(row[2]), + BusID: strings.TrimSpace(row[3]), + VBIOS: strings.TrimSpace(row[4]), + PowerLimitW: parseBenchmarkFloat(row[5]), + MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), + MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), + } + if len(row) >= 9 { + info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) + } + if q.extended { + if len(row) >= 10 { + info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) + } + if len(row) >= 11 { + info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) + } + } + infoByIndex[idx] = info + } + return infoByIndex, nil + } + return nil, lastErr +} + + func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { if os.Geteuid() != 0 { result.Normalization.Status = "partial" @@ -454,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil) }}) } + } else { + rec.GPUClockLockStatus = "skipped" + rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0") + result.Normalization.Status = "partial" } if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 { @@ -1209,3 +1248,246 @@ func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvaila } return sp } + +// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT"). +// Returns empty string if unavailable (non-Linux or missing DMI entry). +func readServerModel() string { + data, err := os.ReadFile("/sys/class/dmi/id/product_name") + if err != nil { + return "" + } + return strings.TrimSpace(string(data)) +} + +// filterRowsByGPU returns only the metric rows for a specific GPU index. +func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow { + var out []GPUMetricRow + for _, r := range rows { + if r.GPUIndex == gpuIndex { + out = append(out, r) + } + } + return out +} + +// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix +// and returns a per-GPU parse result map. +func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult { + gpuLines := make(map[int][]string) + for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") { + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, "[gpu ") { + continue + } + end := strings.Index(line, "] ") + if end < 0 { + continue + } + gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end])) + if err != nil { + continue + } + gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:]) + } + results := make(map[int]benchmarkBurnParseResult, len(gpuLines)) + for gpuIdx, lines := range gpuLines { + // Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog + // calls stripBenchmarkPrefix which is a no-op on already-stripped lines. + results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n")) + } + return results +} + +// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs +// simultaneously using a single bee-gpu-burn invocation per phase. +func runNvidiaBenchmarkParallel( + ctx context.Context, + verboseLog, runDir string, + selected []int, + infoByIndex map[int]benchmarkGPUInfo, + opts NvidiaBenchmarkOptions, + spec benchmarkProfileSpec, + logFunc func(string), + result *NvidiaBenchmarkResult, + serverIdleW *float64, serverLoadedWSum *float64, + serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, +) { + allDevices := joinIndexList(selected) + + // Build per-GPU result stubs. + gpuResults := make(map[int]*BenchmarkGPUResult, len(selected)) + for _, idx := range selected { + r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"} + if info, ok := infoByIndex[idx]; ok { + r.UUID = info.UUID + r.Name = info.Name + r.BusID = info.BusID + r.VBIOS = info.VBIOS + r.PowerLimitW = info.PowerLimitW + r.MultiprocessorCount = info.MultiprocessorCount + r.DefaultPowerLimitW = info.DefaultPowerLimitW + r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz + r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz + r.MaxMemoryClockMHz = info.MaxMemoryClockMHz + } + if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { + r.LockedGraphicsClockMHz = norm.GPUClockLockMHz + r.LockedMemoryClockMHz = norm.MemoryClockLockMHz + } + gpuResults[idx] = r + } + + // Baseline: sample all GPUs together. + baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected) + if err != nil && err != context.Canceled { + for _, idx := range selected { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error()) + } + } + for _, idx := range selected { + perGPU := filterRowsByGPU(baselineRows, idx) + gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU) + } + + // Sample server idle power once. + if !*serverIdleOK { + if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { + *serverIdleW = w + *serverIdleOK = true + logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + } + } + + // Warmup: all GPUs simultaneously. + warmupCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.WarmupSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", allDevices, + } + logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) + warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc) + _ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644) + for _, idx := range selected { + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx)) + } + if warmupErr != nil { + for _, idx := range selected { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) + } + } + + // Snapshot throttle counters before steady. + beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) + for _, idx := range selected { + beforeThrottle[idx], _ = queryThrottleCounters(idx) + } + + // Steady: all GPUs simultaneously. + steadyCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.SteadySec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", allDevices, + } + logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec)) + + // Sample server power via IPMI in parallel with steady phase. + ipmiStopCh := make(chan struct{}) + ipmiResultCh := make(chan float64, 1) + go func() { + defer close(ipmiResultCh) + var samples []float64 + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + select { + case <-ipmiStopCh: + return + case <-time.After(15 * time.Second): + } + for { + if w, err := queryIPMIServerPowerW(); err == nil { + samples = append(samples, w) + } + select { + case <-ipmiStopCh: + if len(samples) > 0 { + var sum float64 + for _, w := range samples { + sum += w + } + ipmiResultCh <- sum / float64(len(samples)) + } + return + case <-ticker.C: + } + } + }() + + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc) + close(ipmiStopCh) + if loadedW, ok := <-ipmiResultCh; ok { + *serverLoadedWSum += loadedW + (*serverLoadedSamples)++ + *serverLoadedOK = true + logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) + } + _ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644) + + afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) + for _, idx := range selected { + afterThrottle[idx], _ = queryThrottleCounters(idx) + } + + parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut)) + + for _, idx := range selected { + perGPU := filterRowsByGPU(steadyRows, idx) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU) + gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) + gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) + + if pr, ok := parseResults[idx]; ok { + gpuResults[idx].ComputeCapability = pr.ComputeCapability + gpuResults[idx].Backend = pr.Backend + gpuResults[idx].PrecisionResults = pr.Profiles + if pr.Fallback { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") + } + } + if steadyErr != nil { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error()) + } + } + + // Cooldown: all GPUs together. + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) + if err != nil && err != context.Canceled { + for _, idx := range selected { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) + } + } + for _, idx := range selected { + perGPU := filterRowsByGPU(cooldownRows, idx) + gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) + writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU) + } + + // Score and finalize each GPU. + for _, idx := range selected { + r := gpuResults[idx] + r.Scores = scoreBenchmarkGPUResult(*r) + r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) + pr := parseResults[idx] + switch { + case steadyErr != nil: + r.Status = classifySATErrorStatus(steadyOut, steadyErr) + case pr.Fallback: + r.Status = "PARTIAL" + default: + r.Status = "OK" + } + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r)) + } +} diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 8861e61..63c9c51 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct { GPUIndices []int ExcludeGPUIndices []int RunNCCL bool + ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially } + type NvidiaBenchmarkResult struct { BenchmarkVersion string `json:"benchmark_version"` GeneratedAt time.Time `json:"generated_at"` Hostname string `json:"hostname,omitempty"` + ServerModel string `json:"server_model,omitempty"` BenchmarkProfile string `json:"benchmark_profile"` + ParallelGPUs bool `json:"parallel_gpus,omitempty"` OverallStatus string `json:"overall_status"` SelectedGPUIndices []int `json:"selected_gpu_indices"` Findings []string `json:"findings,omitempty"` diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 2fe53bd..927f5ad 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -470,6 +470,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req GPUIndices []int `json:"gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"` RunNCCL *bool `json:"run_nccl"` + ParallelGPUs *bool `json:"parallel_gpus"` DisplayName string `json:"display_name"` } if r.Body != nil { @@ -483,6 +484,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req if body.RunNCCL != nil { runNCCL = *body.RunNCCL } + parallelGPUs := false + if body.ParallelGPUs != nil { + parallelGPUs = *body.ParallelGPUs + } name := taskDisplayName("nvidia-benchmark", "", "") if strings.TrimSpace(body.DisplayName) != "" { name = body.DisplayName @@ -493,6 +498,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req SizeMB: body.SizeMB, BenchmarkProfile: body.Profile, RunNCCL: runNCCL, + ParallelGPUs: parallelGPUs, DisplayName: body.DisplayName, }, name, h.opts.App, "benchmark-nvidia") if err != nil { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 69ed8a3..8e82fd8 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -1625,6 +1625,10 @@ func renderBenchmark(opts HandlerOptions) string {

Loading NVIDIA GPUs...

+