Files
bee/audit/internal/platform/benchmark.go
Mikhail Chusavitin 93cfa78e8c Benchmark: parallel GPU mode, resilient inventory query, server model in results
- Add parallel GPU mode (checkbox, off by default): runs all selected GPUs
  simultaneously via a single bee-gpu-burn invocation instead of sequentially;
  per-GPU telemetry, throttle counters, TOPS, and scoring are preserved
- Make queryBenchmarkGPUInfo resilient: falls back to a base field set when
  extended fields (attribute.multiprocessor_count, power.default_limit) cause
  exit status 2, preventing lgc normalization from being silently skipped
- Log explicit "graphics clock lock skipped" note when inventory is unavailable
- Collect server model from DMI (/sys/class/dmi/id/product_name) and store in
  result JSON; benchmark history columns now show "Server Model (N× GPU Model)"
  grouped by server+GPU type rather than individual GPU index

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 18:32:15 +03:00

1494 lines
49 KiB
Go

package platform
import (
"context"
"encoding/csv"
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
const benchmarkVersion = "1"
type benchmarkProfileSpec struct {
Name string
BaselineSec int
WarmupSec int
SteadySec int
NCCLSec int
CooldownSec int
}
type benchmarkGPUInfo struct {
Index int
UUID string
Name string
BusID string
VBIOS string
PowerLimitW float64
DefaultPowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
MultiprocessorCount int
}
type benchmarkBurnProfile struct {
name string
category string
supported bool
lanes int
m uint64
n uint64
k uint64
iterations uint64
notes string
}
type benchmarkBurnParseResult struct {
Device string
ComputeCapability string
Backend string
DurationSec int
Profiles []BenchmarkPrecisionResult
Fallback bool
}
type benchmarkRestoreAction struct {
name string
fn func()
}
var (
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if logFunc == nil {
logFunc = func(string) {}
}
if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-benchmark"
}
spec := resolveBenchmarkProfile(opts.Profile)
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil {
return "", err
}
if len(selected) == 0 {
return "", fmt.Errorf("no NVIDIA GPUs selected")
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
hostname, _ := os.Hostname()
result := NvidiaBenchmarkResult{
BenchmarkVersion: benchmarkVersion,
GeneratedAt: time.Now().UTC(),
Hostname: hostname,
ServerModel: readServerModel(),
BenchmarkProfile: spec.Name,
ParallelGPUs: opts.ParallelGPUs,
SelectedGPUIndices: append([]int(nil), selected...),
Normalization: BenchmarkNormalization{
Status: "full",
},
}
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
// Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64
var serverIdleOK, serverLoadedOK bool
var serverLoadedSamples int
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial"
}
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 {
result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
result.Normalization.Status = "partial"
}
restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
}
}()
if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
} else {
for _, idx := range selected {
gpuResult := BenchmarkGPUResult{
Index: idx,
Status: "FAILED",
}
if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID
gpuResult.Name = info.Name
gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.SteadySec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
}
parseResult := parseBenchmarkBurnLog(string(steadyOut))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
} // end sequential path
if len(selected) > 1 && opts.RunNCCL {
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
if result.Interconnect != nil && result.Interconnect.Supported {
for i := range result.GPUs {
result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
}
}
}
// Compute server power characterization from accumulated IPMI samples.
var gpuReportedSumW float64
for _, gpu := range result.GPUs {
gpuReportedSumW += gpu.Steady.AvgPowerW
}
var serverLoadedW float64
if serverLoadedSamples > 0 {
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal benchmark result: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
return "", fmt.Errorf("write result.json: %w", err)
}
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.txt: %w", err)
}
summary := renderBenchmarkSummary(result)
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
return "", fmt.Errorf("write summary.txt: %w", err)
}
archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
if err := createTarGz(archive, runDir); err != nil {
return "", fmt.Errorf("pack benchmark archive: %w", err)
}
return archive, nil
}
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
case NvidiaBenchmarkProfileStability:
opts.Profile = NvidiaBenchmarkProfileStability
case NvidiaBenchmarkProfileOvernight:
opts.Profile = NvidiaBenchmarkProfileOvernight
default:
opts.Profile = NvidiaBenchmarkProfileStandard
}
if opts.SizeMB < 0 {
opts.SizeMB = 0
}
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
return opts
}
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
case NvidiaBenchmarkProfileOvernight:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
default:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
}
}
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
var benchmarkGPUInfoQueries = []struct {
fields string
extended bool // whether this query includes optional extended fields
}{
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
extended: true,
},
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
extended: false,
},
}
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
var lastErr error
for _, q := range benchmarkGPUInfoQueries {
args := []string{
"--query-gpu=" + q.fields,
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
out, err := satExecCommand("nvidia-smi", args...).Output()
if err != nil {
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
continue
}
r := csv.NewReader(strings.NewReader(string(out)))
r.TrimLeadingSpace = true
r.FieldsPerRecord = -1
rows, err := r.ReadAll()
if err != nil {
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
continue
}
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
for _, row := range rows {
if len(row) < 9 {
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
if err != nil {
continue
}
info := benchmarkGPUInfo{
Index: idx,
UUID: strings.TrimSpace(row[1]),
Name: strings.TrimSpace(row[2]),
BusID: strings.TrimSpace(row[3]),
VBIOS: strings.TrimSpace(row[4]),
PowerLimitW: parseBenchmarkFloat(row[5]),
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
}
if len(row) >= 9 {
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
}
if q.extended {
if len(row) >= 10 {
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
}
if len(row) >= 11 {
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
}
}
infoByIndex[idx] = info
}
return infoByIndex, nil
}
return nil, lastErr
}
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
if os.Geteuid() != 0 {
result.Normalization.Status = "partial"
result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
for _, idx := range gpuIndices {
result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
Index: idx,
Notes: []string{"normalization skipped: root privileges are required"},
})
}
return nil
}
var restore []benchmarkRestoreAction
for _, idx := range gpuIndices {
rec := BenchmarkNormalizationGPU{Index: idx}
if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
rec.PersistenceMode = "failed"
rec.Notes = append(rec.Notes, "failed to enable persistence mode")
result.Normalization.Status = "partial"
} else {
rec.PersistenceMode = "applied"
}
if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
target := int(math.Round(info.MaxGraphicsClockMHz))
if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
rec.GPUClockLockStatus = "failed"
rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
result.Normalization.Status = "partial"
} else {
rec.GPUClockLockStatus = "applied"
rec.GPUClockLockMHz = float64(target)
idxCopy := idx
restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
}})
}
} else {
rec.GPUClockLockStatus = "skipped"
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
result.Normalization.Status = "partial"
}
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
target := int(math.Round(info.MaxMemoryClockMHz))
out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
switch {
case err == nil:
rec.MemoryClockLockStatus = "applied"
rec.MemoryClockLockMHz = float64(target)
idxCopy := idx
restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
}})
case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
rec.MemoryClockLockStatus = "unsupported"
rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
result.Normalization.Status = "partial"
default:
rec.MemoryClockLockStatus = "failed"
rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
result.Normalization.Status = "partial"
}
}
result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
}
return restore
}
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
if durationSec <= 0 {
return nil, nil
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var rows []GPUMetricRow
start := time.Now()
for {
if ctx.Err() != nil {
return rows, ctx.Err()
}
samples, err := sampleGPUMetrics(gpuIndices)
if err == nil {
elapsed := time.Since(start).Seconds()
for i := range samples {
samples[i].ElapsedSec = elapsed
}
rows = append(rows, samples...)
}
if time.Now().After(deadline) {
break
}
select {
case <-ctx.Done():
return rows, ctx.Err()
case <-time.After(time.Second):
}
}
return rows, nil
}
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
stopCh := make(chan struct{})
doneCh := make(chan struct{})
var metricRows []GPUMetricRow
start := time.Now()
go func() {
defer close(doneCh)
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
return
case <-ticker.C:
samples, err := sampleGPUMetrics(gpuIndices)
if err != nil {
continue
}
elapsed := time.Since(start).Seconds()
for i := range samples {
samples[i].ElapsedSec = elapsed
}
metricRows = append(metricRows, samples...)
}
}
}()
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
close(stopCh)
<-doneCh
writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
return out, metricRows, err
}
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) {
if len(rows) == 0 {
return
}
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows)
chart := RenderGPUTerminalChart(rows)
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
}
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
result := benchmarkBurnParseResult{}
lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
profiles := make(map[string]*benchmarkBurnProfile)
for _, line := range lines {
line = stripBenchmarkPrefix(strings.TrimSpace(line))
if line == "" {
continue
}
switch {
case strings.HasPrefix(line, "device="):
result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
case strings.HasPrefix(line, "compute_capability="):
result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
case strings.HasPrefix(line, "backend="):
result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
result.Fallback = result.Backend == "driver-ptx"
case strings.HasPrefix(line, "duration_s="):
result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
default:
if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
profile := ensureBenchmarkProfile(profiles, m[1])
profile.supported = true
profile.lanes++
profile.m, _ = strconv.ParseUint(m[3], 10, 64)
profile.n, _ = strconv.ParseUint(m[4], 10, 64)
profile.k, _ = strconv.ParseUint(m[5], 10, 64)
continue
}
if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
profile := ensureBenchmarkProfile(profiles, m[1])
profile.supported = false
profile.notes = strings.TrimSpace(m[2])
continue
}
if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
profile := ensureBenchmarkProfile(profiles, m[1])
iters, _ := strconv.ParseUint(m[2], 10, 64)
profile.iterations += iters
}
}
}
keys := make([]string, 0, len(profiles))
for key := range profiles {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
profile := profiles[key]
precision := BenchmarkPrecisionResult{
Name: profile.name,
Category: profile.category,
Supported: profile.supported,
Lanes: profile.lanes,
M: profile.m,
N: profile.n,
K: profile.k,
Iterations: profile.iterations,
Notes: profile.notes,
}
if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
}
result.Profiles = append(result.Profiles, precision)
}
return result
}
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
if profile, ok := profiles[name]; ok {
return profile
}
category := "other"
switch {
case strings.HasPrefix(name, "fp64"):
category = "fp64"
case strings.HasPrefix(name, "fp32"):
category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"):
category = "fp16_bf16"
case strings.HasPrefix(name, "fp8"):
category = "fp8"
case strings.HasPrefix(name, "fp4"):
category = "fp4"
}
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
profiles[name] = profile
return profile
}
func stripBenchmarkPrefix(line string) string {
if strings.HasPrefix(line, "[gpu ") {
if idx := strings.Index(line, "] "); idx >= 0 {
return line[idx+2:]
}
}
return line
}
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
summary := BenchmarkTelemetrySummary{}
if len(rows) == 0 {
return summary
}
temps := make([]float64, 0, len(rows))
powers := make([]float64, 0, len(rows))
clocks := make([]float64, 0, len(rows))
memClocks := make([]float64, 0, len(rows))
usages := make([]float64, 0, len(rows))
memUsages := make([]float64, 0, len(rows))
summary.DurationSec = rows[len(rows)-1].ElapsedSec
summary.Samples = len(rows)
for _, row := range rows {
temps = append(temps, row.TempC)
powers = append(powers, row.PowerW)
clocks = append(clocks, row.ClockMHz)
memClocks = append(memClocks, row.MemClockMHz)
usages = append(usages, row.UsagePct)
memUsages = append(memUsages, row.MemUsagePct)
}
summary.AvgTempC = benchmarkMean(temps)
summary.P95TempC = benchmarkPercentile(temps, 95)
summary.AvgPowerW = benchmarkMean(powers)
summary.P95PowerW = benchmarkPercentile(powers, 95)
summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
summary.AvgUsagePct = benchmarkMean(usages)
summary.AvgMemUsagePct = benchmarkMean(memUsages)
summary.ClockCVPct = benchmarkCV(clocks)
summary.PowerCVPct = benchmarkCV(powers)
summary.TempCVPct = benchmarkCV(temps)
summary.ClockDriftPct = benchmarkClockDrift(clocks)
return summary
}
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{}
for _, precision := range gpu.PrecisionResults {
if precision.Supported {
score.ComputeScore += precision.TeraOpsPerSec
}
}
// Use default power limit for sustain score so a manually reduced limit
// does not inflate the score. Fall back to enforced limit if default unknown.
referencePowerW := gpu.DefaultPowerLimitW
if referencePowerW <= 0 {
referencePowerW = gpu.PowerLimitW
}
if referencePowerW > 0 {
score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
}
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
score.CompositeScore = compositeBenchmarkScore(score)
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
}
return score
}
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
if score.InterconnectScore > 0 {
quality += 0.10
}
if quality > 1.10 {
quality = 1.10
}
return score.ComputeScore * quality
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
var reasons []string
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
reasons = append(reasons, "power_capped")
}
if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
reasons = append(reasons, "thermal_limited")
}
if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
reasons = append(reasons, "sync_boost_limited")
}
if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
reasons = append(reasons, "low_sm_clock_vs_target")
}
if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
reasons = append(reasons, "variance_too_high")
}
if normalizationStatus != "full" {
reasons = append(reasons, "normalization_partial")
}
return dedupeStrings(reasons)
}
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
result := &BenchmarkInterconnectResult{
Status: "UNSUPPORTED",
Attempted: true,
SelectedGPUIndices: append([]int(nil), gpuIndices...),
}
cmd := []string{
"all_reduce_perf",
"-b", "512M",
"-e", "4G",
"-f", "2",
"-g", strconv.Itoa(len(gpuIndices)),
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
}
env := []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
if err != nil {
result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
return result
}
avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
result.Status = "OK"
result.Supported = true
result.AvgAlgBWGBps = avgAlg
result.MaxAlgBWGBps = maxAlg
result.AvgBusBWGBps = avgBus
result.MaxBusBWGBps = maxBus
return result
}
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
var algs []float64
var buses []float64
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
continue
}
fields := strings.Fields(line)
if len(fields) < 8 {
continue
}
for i := 0; i+2 < len(fields); i++ {
timeVal, err1 := strconv.ParseFloat(fields[i], 64)
algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
algs = append(algs, algVal)
buses = append(buses, busVal)
break
}
}
}
if len(algs) == 0 {
return 0, 0, 0, 0
}
return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
out, err := satExecCommand(
"nvidia-smi",
"--id="+strconv.Itoa(gpuIndex),
"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return BenchmarkThrottleCounters{}, err
}
fields := strings.Split(strings.TrimSpace(string(out)), ",")
if len(fields) < 5 {
return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
}
return BenchmarkThrottleCounters{
SWPowerCapUS: parseBenchmarkUint64(fields[0]),
SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]),
SyncBoostUS: parseBenchmarkUint64(fields[2]),
HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]),
HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
}, nil
}
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
return BenchmarkThrottleCounters{
SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
}
}
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
args := []string{
"--query-compute-apps=gpu_uuid,pid,process_name",
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
out, err := satExecCommand("nvidia-smi", args...).Output()
if err != nil {
return nil, err
}
var lines []string
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
lines = append(lines, line)
}
return lines, nil
}
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
if gpu.Status == "" {
gpu.Status = "OK"
}
if gpu.Scores.CompositeScore == 0 {
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
}
return gpu
}
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
var findings []string
passed := 0
for _, gpu := range result.GPUs {
if gpu.Status == "OK" {
passed++
}
}
total := len(result.GPUs)
if total > 0 {
if passed == total {
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
} else {
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
}
}
if result.Normalization.Status != "full" {
findings = append(findings, "Environment normalization was partial; compare results with caution.")
}
for _, gpu := range result.GPUs {
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
continue
}
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
continue
}
for _, reason := range gpu.DegradationReasons {
switch reason {
case "power_capped":
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
case "thermal_limited":
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
case "sync_boost_limited":
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
case "low_sm_clock_vs_target":
findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
case "variance_too_high":
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
case "normalization_partial":
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
}
}
if gpu.Backend == "driver-ptx" {
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
}
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
findings = append(findings, fmt.Sprintf(
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
))
}
}
if result.Interconnect != nil && result.Interconnect.Supported {
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
}
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
findings = append(findings, fmt.Sprintf(
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
))
}
}
return dedupeStrings(findings)
}
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
if len(result.GPUs) == 0 {
return "FAILED"
}
hasOK := false
hasPartial := result.Normalization.Status != "full"
for _, gpu := range result.GPUs {
switch gpu.Status {
case "OK":
hasOK = true
case "PARTIAL", "UNSUPPORTED":
hasPartial = true
}
}
if !hasOK {
return "FAILED"
}
if hasPartial {
return "PARTIAL"
}
return "OK"
}
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
for i := range items {
if items[i].Index == idx {
return &items[i]
}
}
return nil
}
func classifySATErrorStatus(out []byte, err error) string {
status, _ := classifySATResult("benchmark", out, err)
if status == "UNSUPPORTED" {
return "UNSUPPORTED"
}
return "FAILED"
}
func parseBenchmarkFloat(raw string) float64 {
raw = strings.TrimSpace(raw)
if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
return 0
}
value, _ := strconv.ParseFloat(raw, 64)
return value
}
func parseBenchmarkUint64(raw string) uint64 {
raw = strings.TrimSpace(raw)
if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
return 0
}
value, _ := strconv.ParseUint(raw, 10, 64)
return value
}
func benchmarkMean(values []float64) float64 {
if len(values) == 0 {
return 0
}
var sum float64
for _, value := range values {
sum += value
}
return sum / float64(len(values))
}
func benchmarkPercentile(values []float64, p float64) float64 {
if len(values) == 0 {
return 0
}
copyValues := append([]float64(nil), values...)
sort.Float64s(copyValues)
if len(copyValues) == 1 {
return copyValues[0]
}
rank := (p / 100.0) * float64(len(copyValues)-1)
lower := int(math.Floor(rank))
upper := int(math.Ceil(rank))
if lower == upper {
return copyValues[lower]
}
frac := rank - float64(lower)
return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac
}
func benchmarkCV(values []float64) float64 {
if len(values) == 0 {
return 0
}
mean := benchmarkMean(values)
if mean == 0 {
return 0
}
var variance float64
for _, value := range values {
diff := value - mean
variance += diff * diff
}
variance /= float64(len(values))
return math.Sqrt(variance) / mean * 100
}
func benchmarkClockDrift(values []float64) float64 {
if len(values) < 4 {
return 0
}
window := len(values) / 4
if window < 1 {
window = 1
}
head := benchmarkMean(values[:window])
tail := benchmarkMean(values[len(values)-window:])
if head <= 0 || tail >= head {
return 0
}
return ((head - tail) / head) * 100
}
func benchmarkMax(values []float64) float64 {
var max float64
for i, value := range values {
if i == 0 || value > max {
max = value
}
}
return max
}
func clampScore(value float64) float64 {
switch {
case value < 0:
return 0
case value > 100:
return 100
default:
return value
}
}
func dedupeStrings(values []string) []string {
if len(values) == 0 {
return nil
}
seen := make(map[string]struct{}, len(values))
out := make([]string, 0, len(values))
for _, value := range values {
value = strings.TrimSpace(value)
if value == "" {
continue
}
if _, ok := seen[value]; ok {
continue
}
seen[value] = struct{}{}
out = append(out, value)
}
return out
}
func saturatingSub(after, before uint64) uint64 {
if after <= before {
return 0
}
return after - before
}
func maxInt(a, b int) int {
if a > b {
return a
}
return b
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
}
for _, line := range strings.Split(string(out), "\n") {
if strings.Contains(line, "Current Power") {
parts := strings.SplitN(line, ":", 2)
if len(parts) == 2 {
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
val = strings.TrimSpace(val)
w, err := strconv.ParseFloat(val, 64)
if err == nil && w > 0 {
return w, nil
}
}
}
}
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
}
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
// durationSec seconds. Returns the mean of all successful samples.
// Returns 0, false if IPMI is unavailable.
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
if durationSec <= 0 {
return 0, false
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var samples []float64
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
if time.Now().After(deadline) {
break
}
select {
case <-ctx.Done():
break
case <-time.After(2 * time.Second):
}
}
if len(samples) == 0 {
return 0, false
}
var sum float64
for _, w := range samples {
sum += w
}
return sum / float64(len(samples)), true
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{Available: ipmiAvailable}
if !ipmiAvailable {
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
return sp
}
sp.IdleW = idleW
sp.LoadedW = loadedW
sp.DeltaW = loadedW - idleW
sp.GPUReportedSumW = gpuReportedSumW
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
}
return sp
}
// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT").
// Returns empty string if unavailable (non-Linux or missing DMI entry).
func readServerModel() string {
data, err := os.ReadFile("/sys/class/dmi/id/product_name")
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}
// filterRowsByGPU returns only the metric rows for a specific GPU index.
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
var out []GPUMetricRow
for _, r := range rows {
if r.GPUIndex == gpuIndex {
out = append(out, r)
}
}
return out
}
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
// and returns a per-GPU parse result map.
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
gpuLines := make(map[int][]string)
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
line = strings.TrimSpace(line)
if !strings.HasPrefix(line, "[gpu ") {
continue
}
end := strings.Index(line, "] ")
if end < 0 {
continue
}
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
if err != nil {
continue
}
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
}
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
for gpuIdx, lines := range gpuLines {
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
}
return results
}
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
// simultaneously using a single bee-gpu-burn invocation per phase.
func runNvidiaBenchmarkParallel(
ctx context.Context,
verboseLog, runDir string,
selected []int,
infoByIndex map[int]benchmarkGPUInfo,
opts NvidiaBenchmarkOptions,
spec benchmarkProfileSpec,
logFunc func(string),
result *NvidiaBenchmarkResult,
serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
) {
allDevices := joinIndexList(selected)
// Build per-GPU result stubs.
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
for _, idx := range selected {
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
if info, ok := infoByIndex[idx]; ok {
r.UUID = info.UUID
r.Name = info.Name
r.BusID = info.BusID
r.VBIOS = info.VBIOS
r.PowerLimitW = info.PowerLimitW
r.MultiprocessorCount = info.MultiprocessorCount
r.DefaultPowerLimitW = info.DefaultPowerLimitW
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
gpuResults[idx] = r
}
// Baseline: sample all GPUs together.
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
}
}
for _, idx := range selected {
perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
}
// Sample server idle power once.
if !*serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
*serverIdleW = w
*serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
// Warmup: all GPUs simultaneously.
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
for _, idx := range selected {
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
}
if warmupErr != nil {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
}
}
// Snapshot throttle counters before steady.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
beforeThrottle[idx], _ = queryThrottleCounters(idx)
}
// Steady: all GPUs simultaneously.
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.SteadySec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
}
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx)
}
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
if pr, ok := parseResults[idx]; ok {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
gpuResults[idx].Backend = pr.Backend
gpuResults[idx].PrecisionResults = pr.Profiles
if pr.Fallback {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
}
if steadyErr != nil {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
}
}
// Cooldown: all GPUs together.
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
}
}
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
}
// Score and finalize each GPU.
for _, idx := range selected {
r := gpuResults[idx]
r.Scores = scoreBenchmarkGPUResult(*r)
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx]
switch {
case steadyErr != nil:
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
case pr.Fallback:
r.Status = "PARTIAL"
default:
r.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
}
}