Files
bee/audit/internal/platform/benchmark.go
Michael Chus a636146dbd Fix power calibration failing due to DCGM resource contention
When a targeted_power attempt is cancelled (e.g. after sw_thermal
throttle), nv-hostengine holds the diagnostic slot asynchronously.
The next attempt immediately received DCGM_ST_IN_USE (exit 222)
and incorrectly derated the power limit.

Now: exit 222 is detected via isDCGMResourceBusy and triggers an
exponential back-off retry at the same power limit (1s, 2s, 4s, …
up to 256s). Once the back-off delay would exceed 300s the
calibration fails, indicating the slot is persistently held.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 20:41:17 +03:00

2927 lines
101 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package platform
import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
const benchmarkVersion = "2"
type benchmarkProfileSpec struct {
Name string
BaselineSec int
WarmupSec int
SteadySec int
NCCLSec int
CooldownSec int
}
type benchmarkGPUInfo struct {
Index int
UUID string
Name string
BusID string
VBIOS string
PowerLimitW float64
DefaultPowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
MultiprocessorCount int
}
type benchmarkPowerCalibrationResult struct {
Summary BenchmarkTelemetrySummary
AppliedPowerLimitW float64
Attempts int
Derated bool
Completed bool
Notes []string
}
type benchmarkBurnProfile struct {
name string
category string
supported bool
lanes int
m uint64
n uint64
k uint64
iterations uint64
notes string
}
type benchmarkBurnParseResult struct {
Device string
ComputeCapability string
Backend string
DurationSec int
Profiles []BenchmarkPrecisionResult
Fallback bool
}
type benchmarkRestoreAction struct {
name string
fn func()
}
var (
benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
func computeCapabilityCode(raw string) int {
raw = strings.TrimSpace(raw)
if raw == "" {
return 0
}
parts := strings.SplitN(raw, ".", 2)
major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
minor := 0
if len(parts) > 1 {
minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
}
return major*10 + minor
}
func benchmarkSupportedPrecisions(computeCapability string) []string {
cc := computeCapabilityCode(computeCapability)
out := make([]string, 0, len(benchmarkPrecisionPhases))
for _, prec := range benchmarkPrecisionPhases {
if prec == "fp4" && cc > 0 && cc < 100 {
continue
}
out = append(out, prec)
}
return out
}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
if len(precisions) == 0 {
precisions = append([]string(nil), benchmarkPrecisionPhases...)
}
switch spec.Name {
case NvidiaBenchmarkProfileStandard:
basePhaseSec = 60
mixedPhaseSec = 300
case NvidiaBenchmarkProfileStability:
basePhaseSec = 300
mixedPhaseSec = 3600
case NvidiaBenchmarkProfileOvernight:
basePhaseSec = 3600
mixedPhaseSec = 14400
default:
totalWeight := len(precisions) + 5
if totalWeight <= 0 {
return nil, nil, 0, 0
}
basePhaseSec = spec.SteadySec / totalWeight
if basePhaseSec <= 0 {
basePhaseSec = 1
}
mixedPhaseSec = basePhaseSec * 5
}
planLabels = make([]string, 0, len(precisions)+1)
planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
for _, prec := range precisions {
planLabels = append(planLabels, prec)
planPhases = append(planPhases, benchmarkPlannedPhase{
PlanLabel: prec,
MetricStage: metricStage(prec),
DurationSec: basePhaseSec,
})
}
planLabels = append(planLabels, "mixed")
planPhases = append(planPhases, benchmarkPlannedPhase{
PlanLabel: "mixed",
MetricStage: metricStage("mixed"),
DurationSec: mixedPhaseSec,
})
return planLabels, planPhases, basePhaseSec, mixedPhaseSec
}
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
values := make([]string, 0, len(phases))
for _, phase := range phases {
values = append(values, strconv.Itoa(phase.DurationSec))
}
return strings.Join(values, ",")
}
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
text := strings.ToLower(strings.TrimSpace(string(raw)))
switch {
case text == "":
return "FAILED", "phase produced no output"
case strings.Contains(text, "phase_error="):
if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "cublaslt_profiles=unsupported") {
return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
}
return "FAILED", "precision phase failed"
case strings.Contains(text, "status=failed"):
if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") {
return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
}
return "FAILED", "precision phase failed"
default:
return "OK", ""
}
}
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
diff := diffThrottleCounters(before, after)
switch {
case diff.HWThermalSlowdownUS > 0:
return "hw_thermal"
case diff.SWThermalSlowdownUS > 0:
return "sw_thermal"
case diff.HWPowerBrakeSlowdownUS > 0:
return "hw_power_brake"
default:
return ""
}
}
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
if powerLimitW <= 0 {
return fmt.Errorf("invalid power limit %d", powerLimitW)
}
out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{
"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW),
}, nil, nil)
if err != nil {
return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
}
return nil
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if logFunc == nil {
logFunc = func(string) {}
}
if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-bench/perf"
}
spec := resolveBenchmarkProfile(opts.Profile)
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil {
return "", err
}
if len(selected) == 0 {
return "", fmt.Errorf("no NVIDIA GPUs selected")
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "perf-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
hostname, _ := os.Hostname()
result := NvidiaBenchmarkResult{
BenchmarkVersion: benchmarkVersion,
GeneratedAt: time.Now().UTC(),
Hostname: hostname,
ServerModel: readServerModel(),
BenchmarkProfile: spec.Name,
ParallelGPUs: opts.ParallelGPUs,
RampStep: opts.RampStep,
RampTotal: opts.RampTotal,
RampRunID: opts.RampRunID,
SelectedGPUIndices: append([]int(nil), selected...),
HostConfig: readBenchmarkHostConfig(),
Normalization: BenchmarkNormalization{
Status: "full",
},
}
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow
metricTimelineSec := 0.0
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64
var serverIdleOK, serverLoadedOK bool
var serverLoadedSamples int
// Run nvidia-smi -q first: used both for the log file and as a fallback
// source of max clock values when CSV clock fields are unsupported.
var nvsmiQOut []byte
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
nvsmiQOut = out
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
}
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial"
}
// Enrich with max clocks from verbose output — covers GPUs where
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 {
result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
result.Normalization.Status = "partial"
}
restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
}
}()
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
restoreActions = append(restoreActions, powerRestoreActions...)
for _, idx := range selected {
if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
result.Warnings = append(result.Warnings, fmt.Sprintf(
"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
idx, calib.AppliedPowerLimitW,
))
}
}
// Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{})
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
} else {
for _, idx := range selected {
gpuResult := BenchmarkGPUResult{
Index: idx,
Status: "FAILED",
}
if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID
gpuResult.Name = info.Name
gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if calib, ok := calibByIndex[idx]; ok {
gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
gpuResult.PowerCalibrationTries = calib.Attempts
gpuResult.PowerLimitDerated = calib.Derated
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
warmupParse := parseBenchmarkBurnLog(string(warmupOut))
if gpuResult.ComputeCapability == "" {
gpuResult.ComputeCapability = warmupParse.ComputeCapability
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPU stays hot between windows.
eccBase, _ := queryECCCounters(idx)
supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" {
return fmt.Sprintf("gpu-%d-steady", idx)
}
return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
})
planCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(basePhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range supportedPrecisions {
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
phaseRows := phaseRowsByStage[stageName]
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(phaseRows),
}
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
}
for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx)
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if planErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
}
steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
if spec.CooldownSec > 0 {
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
}
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
gpuResult.Status = "PARTIAL"
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
} // end sequential path
if len(selected) > 1 && opts.RunNCCL {
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
if result.Interconnect != nil && result.Interconnect.Supported {
for i := range result.GPUs {
result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
}
}
}
// Stop CPU load sampler and attach results.
close(cpuStopCh)
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
result.CPULoad = summarizeCPULoad(cpuSamples)
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
}
}
// Compute server power characterization from accumulated IPMI samples.
var gpuReportedSumW float64
for _, gpu := range result.GPUs {
gpuReportedSumW += gpu.Steady.AvgPowerW
}
var serverLoadedW float64
if serverLoadedSamples > 0 {
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.Cooling = summarizeBenchmarkCooling(metricRows)
// Apply server-power penalty when IPMI reports the server delta is much
// lower than GPU-reported sum: GPU power telemetry is over-stated, making
// CalibratedPeakPowerW and PowerSustainScore unreliable.
// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
factor := sp.ReportingRatio / 0.75
for i := range result.GPUs {
result.GPUs[i].Scores.CompositeScore *= factor
result.GPUs[i].Notes = append(result.GPUs[i].Notes,
fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
sp.ReportingRatio, factor*100))
}
}
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
writeBenchmarkMetricsFiles(runDir, metricRows)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal benchmark result: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
return "", fmt.Errorf("write result.json: %w", err)
}
report := renderBenchmarkReportWithCharts(result)
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err)
}
summary := renderBenchmarkSummary(result)
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
return "", fmt.Errorf("write summary.txt: %w", err)
}
return runDir, nil
}
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
case NvidiaBenchmarkProfileStability:
opts.Profile = NvidiaBenchmarkProfileStability
case NvidiaBenchmarkProfileOvernight:
opts.Profile = NvidiaBenchmarkProfileOvernight
default:
opts.Profile = NvidiaBenchmarkProfileStandard
}
if opts.SizeMB < 0 {
opts.SizeMB = 0
}
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
return opts
}
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
case NvidiaBenchmarkProfileOvernight:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
default:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
}
}
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct {
fields string
extended bool // whether this query includes optional extended fields
minimal bool // clock fields omitted; max clocks must be filled separately
}{
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
extended: true,
},
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
extended: false,
},
{
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit",
minimal: true,
},
}
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
return
}
// Build bus_id → index map for matching verbose sections to GPU indices.
busToBenchIdx := make(map[string]int, len(infoByIndex))
for idx, info := range infoByIndex {
if info.BusID != "" {
// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
// while --query-gpu returns the same format; normalise to lower.
busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
}
}
// Split the verbose output into per-GPU sections on "^GPU " lines.
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts {
busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
benchIdx, ok := busToBenchIdx[busID]
if !ok {
// Bus IDs from verbose output may have a different domain prefix;
// try suffix match on the slot portion (XX:XX.X).
for k, v := range busToBenchIdx {
if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
benchIdx = v
ok = true
break
}
}
}
if !ok {
continue
}
end := len(nvsmiQ)
if i+1 < len(sectionStarts) {
end = sectionStarts[i+1][0]
}
section := nvsmiQ[loc[0]:end]
info := infoByIndex[benchIdx]
if info.MaxGraphicsClockMHz == 0 {
if m := maxGfxRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
info.MaxGraphicsClockMHz = v
}
}
}
if info.MaxMemoryClockMHz == 0 {
if m := maxMemRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
info.MaxMemoryClockMHz = v
}
}
}
if info.DefaultPowerLimitW == 0 {
if m := defaultPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.DefaultPowerLimitW = v
}
}
}
if info.PowerLimitW == 0 {
if m := currentPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.PowerLimitW = v
}
}
}
if info.MultiprocessorCount == 0 {
if m := smCountRe.FindSubmatch(section); m != nil {
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
info.MultiprocessorCount = v
}
}
}
infoByIndex[benchIdx] = info
}
}
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
var lastErr error
for _, q := range benchmarkGPUInfoQueries {
args := []string{
"--query-gpu=" + q.fields,
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
out, err := satExecCommand("nvidia-smi", args...).Output()
if err != nil {
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
continue
}
r := csv.NewReader(strings.NewReader(string(out)))
r.TrimLeadingSpace = true
r.FieldsPerRecord = -1
rows, err := r.ReadAll()
if err != nil {
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
continue
}
minFields := 6
if !q.minimal {
minFields = 9
}
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
for _, row := range rows {
if len(row) < minFields {
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
if err != nil {
continue
}
info := benchmarkGPUInfo{
Index: idx,
UUID: strings.TrimSpace(row[1]),
Name: strings.TrimSpace(row[2]),
BusID: strings.TrimSpace(row[3]),
VBIOS: strings.TrimSpace(row[4]),
PowerLimitW: parseBenchmarkFloat(row[5]),
}
if !q.minimal {
info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
if len(row) >= 9 {
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
}
if q.extended {
if len(row) >= 10 {
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
}
if len(row) >= 11 {
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
}
}
}
infoByIndex[idx] = info
}
return infoByIndex, nil
}
return nil, lastErr
}
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
if os.Geteuid() != 0 {
result.Normalization.Status = "partial"
result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
for _, idx := range gpuIndices {
result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
Index: idx,
Notes: []string{"normalization skipped: root privileges are required"},
})
}
return nil
}
var restore []benchmarkRestoreAction
for _, idx := range gpuIndices {
rec := BenchmarkNormalizationGPU{Index: idx}
if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
rec.PersistenceMode = "failed"
rec.Notes = append(rec.Notes, "failed to enable persistence mode")
result.Normalization.Status = "partial"
} else {
rec.PersistenceMode = "applied"
}
if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
target := int(math.Round(info.MaxGraphicsClockMHz))
if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
rec.GPUClockLockStatus = "failed"
rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
result.Normalization.Status = "partial"
} else {
rec.GPUClockLockStatus = "applied"
rec.GPUClockLockMHz = float64(target)
idxCopy := idx
restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
}})
}
} else {
rec.GPUClockLockStatus = "skipped"
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
result.Normalization.Status = "partial"
}
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
target := int(math.Round(info.MaxMemoryClockMHz))
out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
switch {
case err == nil:
rec.MemoryClockLockStatus = "applied"
rec.MemoryClockLockMHz = float64(target)
idxCopy := idx
restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
}})
case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
rec.MemoryClockLockStatus = "unsupported"
rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
result.Normalization.Status = "partial"
default:
rec.MemoryClockLockStatus = "failed"
rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
result.Normalization.Status = "partial"
}
}
result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
}
return restore
}
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
if durationSec <= 0 {
return nil, nil
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var rows []GPUMetricRow
start := time.Now()
for {
if ctx.Err() != nil {
return rows, ctx.Err()
}
samples, err := sampleBenchmarkTelemetry(gpuIndices)
if err == nil {
elapsed := time.Since(start).Seconds()
for i := range samples {
samples[i].ElapsedSec = elapsed
}
rows = append(rows, samples...)
}
if time.Now().After(deadline) {
break
}
select {
case <-ctx.Done():
return rows, ctx.Err()
case <-time.After(time.Second):
}
}
return rows, nil
}
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
stopCh := make(chan struct{})
doneCh := make(chan struct{})
var metricRows []GPUMetricRow
start := time.Now()
go func() {
defer close(doneCh)
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
return
case <-ticker.C:
samples, err := sampleBenchmarkTelemetry(gpuIndices)
if err != nil {
continue
}
elapsed := time.Since(start).Seconds()
for i := range samples {
samples[i].ElapsedSec = elapsed
}
metricRows = append(metricRows, samples...)
}
}
}()
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
close(stopCh)
<-doneCh
return out, metricRows, err
}
type benchmarkPlannedPhase struct {
PlanLabel string
MetricStage string
DurationSec int
}
func runBenchmarkPlannedCommandWithMetrics(
ctx context.Context,
verboseLog, name string,
cmd []string,
env []string,
gpuIndices []int,
phases []benchmarkPlannedPhase,
logFunc func(string),
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err
}
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
out := make(map[string][]GPUMetricRow, len(phases))
if len(rows) == 0 || len(phases) == 0 {
return out
}
for _, row := range rows {
idx := len(phases) - 1
var elapsed float64
for i, phase := range phases {
durationSec := phase.DurationSec
if durationSec <= 0 {
durationSec = 1
}
elapsed += float64(durationSec)
if row.ElapsedSec < elapsed {
idx = i
break
}
}
out[phases[idx].MetricStage] = append(out[phases[idx].MetricStage], row)
}
return out
}
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
out := make(map[string][]byte)
var current string
for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") {
trimmed := strings.TrimSpace(stripBenchmarkPrefix(line))
switch {
case strings.HasPrefix(trimmed, "phase_begin="):
current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin="))
case strings.HasPrefix(trimmed, "phase_end="):
current = ""
case current != "":
out[current] = append(out[current], []byte(line+"\n")...)
}
}
return out
}
type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool
}
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
samples, err := sampleGPUMetrics(gpuIndices)
if err != nil {
return nil, err
}
fanSample := sampleBenchmarkCoolingSample()
for i := range samples {
samples[i].FanAvgRPM = fanSample.AvgFanRPM
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
}
return samples, nil
}
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
fans, _ := sampleFanSpeeds()
avgRPM, _, _ := fanRPMStats(fans)
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
return benchmarkCoolingSample{
AvgFanRPM: avgRPM,
AvgFanDutyCyclePct: dutyPct,
FanDutyCycleAvailable: dutyAvailable,
}
}
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
if len(rows) == 0 {
return nil
}
stageEnd := offset + durationSec
if stageEnd <= offset {
stageEnd = offset
for _, row := range rows {
if row.ElapsedSec+offset > stageEnd {
stageEnd = row.ElapsedSec + offset
}
}
}
out := make([]GPUMetricRow, len(rows))
for i, row := range rows {
row.Stage = stage
row.ElapsedSec += offset
row.StageStartSec = offset
row.StageEndSec = stageEnd
out[i] = row
}
return out
}
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
*allRows = append(*allRows, annotated...)
*cursor += durationSec
}
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
if len(rows) == 0 {
return
}
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows)
}
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
if path == "" || len(raw) == 0 {
return
}
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
return
}
defer f.Close()
header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage)
_, _ = f.WriteString(header)
if len(raw) > 0 {
_, _ = f.Write(raw)
if raw[len(raw)-1] != '\n' {
_, _ = f.WriteString("\n")
}
}
}
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
result := benchmarkBurnParseResult{}
lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
profiles := make(map[string]*benchmarkBurnProfile)
for _, line := range lines {
line = stripBenchmarkPrefix(strings.TrimSpace(line))
if line == "" {
continue
}
switch {
case strings.HasPrefix(line, "device="):
result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
case strings.HasPrefix(line, "compute_capability="):
result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
case strings.HasPrefix(line, "backend="):
result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
result.Fallback = result.Backend == "driver-ptx"
case strings.HasPrefix(line, "duration_s="):
result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
default:
if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
profile := ensureBenchmarkProfile(profiles, m[1])
profile.supported = true
profile.lanes++
profile.m, _ = strconv.ParseUint(m[3], 10, 64)
profile.n, _ = strconv.ParseUint(m[4], 10, 64)
profile.k, _ = strconv.ParseUint(m[5], 10, 64)
continue
}
if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
profile := ensureBenchmarkProfile(profiles, m[1])
profile.supported = false
profile.notes = strings.TrimSpace(m[2])
continue
}
if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
profile := ensureBenchmarkProfile(profiles, m[1])
iters, _ := strconv.ParseUint(m[2], 10, 64)
profile.iterations += iters
}
}
}
keys := make([]string, 0, len(profiles))
for key := range profiles {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
profile := profiles[key]
precision := BenchmarkPrecisionResult{
Name: profile.name,
Category: profile.category,
Supported: profile.supported,
Lanes: profile.lanes,
M: profile.m,
N: profile.n,
K: profile.k,
Iterations: profile.iterations,
Notes: profile.notes,
}
w := precisionWeight(profile.category)
precision.Weight = w
if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
}
result.Profiles = append(result.Profiles, precision)
}
return result
}
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
if profile, ok := profiles[name]; ok {
return profile
}
category := "other"
switch {
case strings.HasPrefix(name, "fp64"):
category = "fp64"
case strings.HasPrefix(name, "fp32"):
category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"):
category = "fp16_bf16"
case strings.HasPrefix(name, "int8"):
category = "int8"
case strings.HasPrefix(name, "fp8"):
category = "fp8"
case strings.HasPrefix(name, "fp4"):
category = "fp4"
}
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
profiles[name] = profile
return profile
}
// precisionWeight returns the fp32-equivalence factor for a precision category.
// Each factor represents how much "real" numeric work one operation of that
// type performs relative to fp32 (single precision = 1.0 baseline):
//
// fp64 = 2.0 — double precision, 2× more bits per operand
// fp32 = 1.0 — single precision baseline
// fp16 = 0.5 — half precision
// int8 = 0.25 — quarter precision
// fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
// Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling
// cross-precision comparison on the same numeric scale.
func precisionWeight(category string) float64 {
switch category {
case "fp64":
return 2.0
case "fp32_tf32":
return 1.0
case "fp16_bf16":
return 0.5
case "int8":
return 0.25
case "fp8":
return 0.25
case "fp4":
return 0.125
default:
return 1.0
}
}
func stripBenchmarkPrefix(line string) string {
if strings.HasPrefix(line, "[gpu ") {
if idx := strings.Index(line, "] "); idx >= 0 {
return line[idx+2:]
}
}
return line
}
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
summary := BenchmarkTelemetrySummary{}
if len(rows) == 0 {
return summary
}
temps := make([]float64, 0, len(rows))
powers := make([]float64, 0, len(rows))
clocks := make([]float64, 0, len(rows))
memClocks := make([]float64, 0, len(rows))
usages := make([]float64, 0, len(rows))
memUsages := make([]float64, 0, len(rows))
summary.DurationSec = rows[len(rows)-1].ElapsedSec
summary.Samples = len(rows)
for _, row := range rows {
temps = append(temps, row.TempC)
powers = append(powers, row.PowerW)
clocks = append(clocks, row.ClockMHz)
memClocks = append(memClocks, row.MemClockMHz)
usages = append(usages, row.UsagePct)
memUsages = append(memUsages, row.MemUsagePct)
}
summary.AvgTempC = benchmarkMean(temps)
summary.P95TempC = benchmarkPercentile(temps, 95)
summary.AvgPowerW = benchmarkMean(powers)
summary.P95PowerW = benchmarkPercentile(powers, 95)
summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
summary.AvgUsagePct = benchmarkMean(usages)
summary.AvgMemUsagePct = benchmarkMean(memUsages)
summary.ClockCVPct = benchmarkCV(clocks)
summary.PowerCVPct = benchmarkCV(powers)
summary.TempCVPct = benchmarkCV(temps)
summary.ClockDriftPct = benchmarkClockDrift(clocks)
return summary
}
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
if len(rows) == 0 {
return nil
}
var rpmValues []float64
var dutyValues []float64
for _, row := range rows {
if row.FanAvgRPM > 0 {
rpmValues = append(rpmValues, row.FanAvgRPM)
}
if row.FanDutyCycleAvailable {
dutyValues = append(dutyValues, row.FanDutyCyclePct)
}
}
if len(rpmValues) == 0 && len(dutyValues) == 0 {
return nil
}
summary := &BenchmarkCoolingSummary{
Available: true,
AvgFanRPM: benchmarkMean(rpmValues),
}
if len(dutyValues) > 0 {
summary.FanDutyCycleAvailable = true
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
} else {
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
}
return summary
}
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{}
// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
// Each precision ran alone with full GPU dedicated — peak capability.
for _, p := range gpu.PrecisionSteady {
score.SyntheticScore += p.WeightedTeraOpsPerSec
}
// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
// All precisions compete simultaneously — closer to real inference workloads.
for _, p := range gpu.PrecisionResults {
if p.Supported {
score.MixedScore += p.WeightedTeraOpsPerSec
}
}
// MixedEfficiency = MixedScore / SyntheticScore.
// Measures how well the GPU sustains throughput under concurrent mixed load.
// A healthy GPU scores ~0.80.95; severe degradation suggests bandwidth
// contention or scheduler inefficiency.
if score.SyntheticScore > 0 && score.MixedScore > 0 {
score.MixedEfficiency = score.MixedScore / score.SyntheticScore
}
// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
// bonus for GPUs that handle mixed-precision concurrency well.
// Falls back to MixedScore alone when per-precision data is absent.
switch {
case score.SyntheticScore > 0:
score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
case score.MixedScore > 0:
score.ComputeScore = score.MixedScore
}
// PowerSustainScore: measures how close the GPU came to its rated TDP under
// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
// Penalty applied symmetrically for both under- and over-TDP deviations:
// score = max(0, 100 |measured rated| / rated × 100)
// Under-TDP → power delivery / cooling issue.
// Over-TDP → power limit not properly enforced / power regulation fault.
// Falls back to 0 if calibration was not performed (dcgmi unavailable).
{
ref := gpu.DefaultPowerLimitW
if ref <= 0 {
ref = gpu.PowerLimitW
}
if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
score.PowerSustainScore = clampScore(100 - deviationPct)
}
}
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
// StabilityScore: prefer per-precision steady phases where each window runs a
// single kernel type so PowerCVPct is a genuine stability signal (not a
// workload-mix artifact). Fall back to combined steady using clock-only metrics
// when per-precision data is absent (older results, short profiles).
if len(gpu.PrecisionSteady) > 0 {
var sum float64
for _, p := range gpu.PrecisionSteady {
sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
}
score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
} else {
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
}
score.CompositeScore = compositeBenchmarkScore(score)
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
}
return score
}
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
// Weights after introducing calibrated power reference:
// base 0.35 — floor so a GPU that fails all sustain checks still scores
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
// stability 0.25 — clock/power variance matters for reproducibility
// power 0.15 — GPU reaches rated TDP under targeted_power? lower weight
// because calibration may be absent (dcgmi not installed)
// NCCL bonus 0.10 — interconnect health
// cap 1.10
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
if score.InterconnectScore > 0 {
quality += 0.10
}
if quality > 1.10 {
quality = 1.10
}
return score.ComputeScore * quality
}
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
var reasons []string
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
reasons = append(reasons, "power_capped")
}
if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
reasons = append(reasons, "thermal_limited")
}
if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
reasons = append(reasons, "sync_boost_limited")
}
if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
reasons = append(reasons, "low_sm_clock_vs_target")
}
if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
reasons = append(reasons, "variance_too_high")
}
if normalizationStatus != "full" {
reasons = append(reasons, "normalization_partial")
}
if gpu.PowerLimitDerated {
reasons = append(reasons, "power_limit_derated")
}
if gpu.ECC.Uncorrected > 0 {
reasons = append(reasons, "ecc_uncorrected_errors")
}
if gpu.ECC.Corrected > 0 {
reasons = append(reasons, "ecc_corrected_errors")
}
return dedupeStrings(reasons)
}
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
result := &BenchmarkInterconnectResult{
Status: "UNSUPPORTED",
Attempted: true,
SelectedGPUIndices: append([]int(nil), gpuIndices...),
}
cmd := []string{
"all_reduce_perf",
"-b", "512M",
"-e", "4G",
"-f", "2",
"-g", strconv.Itoa(len(gpuIndices)),
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
}
env := []string{
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
}
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
if err != nil {
result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
return result
}
avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
result.Status = "OK"
result.Supported = true
result.AvgAlgBWGBps = avgAlg
result.MaxAlgBWGBps = maxAlg
result.AvgBusBWGBps = avgBus
result.MaxBusBWGBps = maxBus
return result
}
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
var algs []float64
var buses []float64
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(line, "#") {
continue
}
fields := strings.Fields(line)
if len(fields) < 8 {
continue
}
for i := 0; i+2 < len(fields); i++ {
timeVal, err1 := strconv.ParseFloat(fields[i], 64)
algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
algs = append(algs, algVal)
buses = append(buses, busVal)
break
}
}
}
if len(algs) == 0 {
return 0, 0, 0, 0
}
return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
out, err := satExecCommand(
"nvidia-smi",
"--id="+strconv.Itoa(gpuIndex),
"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return BenchmarkThrottleCounters{}, err
}
fields := strings.Split(strings.TrimSpace(string(out)), ",")
if len(fields) < 5 {
return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
}
return BenchmarkThrottleCounters{
SWPowerCapUS: parseBenchmarkUint64(fields[0]),
SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]),
SyncBoostUS: parseBenchmarkUint64(fields[2]),
HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]),
HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
}, nil
}
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
return BenchmarkThrottleCounters{
SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
}
}
func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
out, err := satExecCommand(
"nvidia-smi",
"--id="+strconv.Itoa(gpuIndex),
"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return BenchmarkECCCounters{}, err
}
fields := strings.Split(strings.TrimSpace(string(out)), ",")
if len(fields) < 2 {
return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
}
corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
if err1 != nil || err2 != nil {
// ECC may be disabled on this GPU — return zero counters silently.
return BenchmarkECCCounters{}, nil
}
return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
}
func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
return BenchmarkECCCounters{
Corrected: saturatingSub(after.Corrected, before.Corrected),
Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected),
}
}
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
args := []string{
"--query-compute-apps=gpu_uuid,pid,process_name",
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
out, err := satExecCommand("nvidia-smi", args...).Output()
if err != nil {
return nil, err
}
var lines []string
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
line = strings.TrimSpace(line)
if line == "" {
continue
}
lines = append(lines, line)
}
return lines, nil
}
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
if gpu.Status == "" {
gpu.Status = "OK"
}
if gpu.Scores.CompositeScore == 0 {
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
}
return gpu
}
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
var findings []string
passed := 0
for _, gpu := range result.GPUs {
if gpu.Status == "OK" {
passed++
}
}
total := len(result.GPUs)
if total > 0 {
if passed == total {
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
} else {
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
}
}
if result.Normalization.Status != "full" {
findings = append(findings, "Environment normalization was partial; compare results with caution.")
}
for _, gpu := range result.GPUs {
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
continue
}
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
continue
}
for _, reason := range gpu.DegradationReasons {
switch reason {
case "power_capped":
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
case "thermal_limited":
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
case "sync_boost_limited":
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
case "low_sm_clock_vs_target":
findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
case "variance_too_high":
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
case "normalization_partial":
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
case "power_limit_derated":
findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
case "ecc_uncorrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
case "ecc_corrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
}
}
if len(gpu.PrecisionFailures) > 0 {
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
}
if gpu.Backend == "driver-ptx" {
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
}
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
findings = append(findings, fmt.Sprintf(
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
))
}
// Flag significant TDP deviation (over or under) from calibration.
if gpu.CalibratedPeakPowerW > 0 {
ref := gpu.DefaultPowerLimitW
if ref <= 0 {
ref = gpu.PowerLimitW
}
if ref > 0 {
deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
switch {
case deviationPct < -10:
findings = append(findings, fmt.Sprintf(
"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
))
case deviationPct > 5:
findings = append(findings, fmt.Sprintf(
"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
))
}
}
}
}
if result.Interconnect != nil && result.Interconnect.Supported {
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
}
if cl := result.CPULoad; cl != nil {
switch cl.Status {
case "high":
findings = append(findings, fmt.Sprintf(
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
cl.AvgPct, cl.MaxPct,
))
case "unstable":
findings = append(findings, fmt.Sprintf(
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
cl.AvgPct, cl.P95Pct,
))
}
}
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
findings = append(findings, fmt.Sprintf(
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
))
} else if sp.ReportingRatio > 1.25 {
findings = append(findings, fmt.Sprintf(
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
))
}
}
return dedupeStrings(findings)
}
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
if len(result.GPUs) == 0 {
return "FAILED"
}
hasOK := false
hasPartial := result.Normalization.Status != "full"
for _, gpu := range result.GPUs {
switch gpu.Status {
case "OK":
hasOK = true
case "PARTIAL", "UNSUPPORTED":
hasPartial = true
}
}
if !hasOK {
return "FAILED"
}
if hasPartial {
return "PARTIAL"
}
return "OK"
}
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
for i := range items {
if items[i].Index == idx {
return &items[i]
}
}
return nil
}
func classifySATErrorStatus(out []byte, err error) string {
status, _ := classifySATResult("benchmark", out, err)
if status == "UNSUPPORTED" {
return "UNSUPPORTED"
}
return "FAILED"
}
func parseBenchmarkFloat(raw string) float64 {
raw = strings.TrimSpace(raw)
if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
return 0
}
value, _ := strconv.ParseFloat(raw, 64)
return value
}
func parseBenchmarkUint64(raw string) uint64 {
raw = strings.TrimSpace(raw)
if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
return 0
}
value, _ := strconv.ParseUint(raw, 10, 64)
return value
}
func benchmarkMean(values []float64) float64 {
if len(values) == 0 {
return 0
}
var sum float64
for _, value := range values {
sum += value
}
return sum / float64(len(values))
}
func benchmarkPercentile(values []float64, p float64) float64 {
if len(values) == 0 {
return 0
}
copyValues := append([]float64(nil), values...)
sort.Float64s(copyValues)
if len(copyValues) == 1 {
return copyValues[0]
}
rank := (p / 100.0) * float64(len(copyValues)-1)
lower := int(math.Floor(rank))
upper := int(math.Ceil(rank))
if lower == upper {
return copyValues[lower]
}
frac := rank - float64(lower)
return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac
}
func benchmarkCV(values []float64) float64 {
if len(values) == 0 {
return 0
}
mean := benchmarkMean(values)
if mean == 0 {
return 0
}
var variance float64
for _, value := range values {
diff := value - mean
variance += diff * diff
}
variance /= float64(len(values))
return math.Sqrt(variance) / mean * 100
}
func benchmarkClockDrift(values []float64) float64 {
if len(values) < 4 {
return 0
}
window := len(values) / 4
if window < 1 {
window = 1
}
head := benchmarkMean(values[:window])
tail := benchmarkMean(values[len(values)-window:])
if head <= 0 || tail >= head {
return 0
}
return ((head - tail) / head) * 100
}
func benchmarkMax(values []float64) float64 {
var max float64
for i, value := range values {
if i == 0 || value > max {
max = value
}
}
return max
}
func clampScore(value float64) float64 {
switch {
case value < 0:
return 0
case value > 100:
return 100
default:
return value
}
}
func dedupeStrings(values []string) []string {
if len(values) == 0 {
return nil
}
seen := make(map[string]struct{}, len(values))
out := make([]string, 0, len(values))
for _, value := range values {
value = strings.TrimSpace(value)
if value == "" {
continue
}
if _, ok := seen[value]; ok {
continue
}
seen[value] = struct{}{}
out = append(out, value)
}
return out
}
func saturatingSub(after, before uint64) uint64 {
if after <= before {
return 0
}
return after - before
}
func maxInt(a, b int) int {
if a > b {
return a
}
return b
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading")
out, err := cmd.Output()
if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
}
if w := parseDCMIPowerReading(string(out)); w > 0 {
return w, nil
}
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
}
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
// durationSec seconds. Returns the mean of all successful samples.
// Returns 0, false if IPMI is unavailable.
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
if durationSec <= 0 {
return 0, false
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var samples []float64
loop:
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
if time.Now().After(deadline) {
break
}
select {
case <-ctx.Done():
break loop
case <-time.After(2 * time.Second):
}
}
if len(samples) == 0 {
return 0, false
}
var sum float64
for _, w := range samples {
sum += w
}
return sum / float64(len(samples)), true
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{Available: ipmiAvailable}
if !ipmiAvailable {
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
return sp
}
sp.IdleW = idleW
sp.LoadedW = loadedW
sp.DeltaW = loadedW - idleW
sp.GPUReportedSumW = gpuReportedSumW
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
}
return sp
}
// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT").
// Returns empty string if unavailable (non-Linux or missing DMI entry).
func readServerModel() string {
data, err := os.ReadFile("/sys/class/dmi/id/product_name")
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}
// filterRowsByGPU returns only the metric rows for a specific GPU index.
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
var out []GPUMetricRow
for _, r := range rows {
if r.GPUIndex == gpuIndex {
out = append(out, r)
}
}
return out
}
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
// and returns a per-GPU parse result map.
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
gpuLines := make(map[int][]string)
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
line = strings.TrimSpace(line)
if !strings.HasPrefix(line, "[gpu ") {
continue
}
end := strings.Index(line, "] ")
if end < 0 {
continue
}
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
if err != nil {
continue
}
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
}
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
for gpuIdx, lines := range gpuLines {
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
}
return results
}
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
// simultaneously using a single bee-gpu-burn invocation per phase.
func runNvidiaBenchmarkParallel(
ctx context.Context,
verboseLog, runDir string,
selected []int,
infoByIndex map[int]benchmarkGPUInfo,
opts NvidiaBenchmarkOptions,
spec benchmarkProfileSpec,
logFunc func(string),
result *NvidiaBenchmarkResult,
calibByIndex map[int]benchmarkPowerCalibrationResult,
serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow,
metricTimelineSec *float64,
gpuBurnLog string,
) {
allDevices := joinIndexList(selected)
// Build per-GPU result stubs.
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
for _, idx := range selected {
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
if info, ok := infoByIndex[idx]; ok {
r.UUID = info.UUID
r.Name = info.Name
r.BusID = info.BusID
r.VBIOS = info.VBIOS
r.PowerLimitW = info.PowerLimitW
r.MultiprocessorCount = info.MultiprocessorCount
r.DefaultPowerLimitW = info.DefaultPowerLimitW
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if calib, ok := calibByIndex[idx]; ok {
r.CalibratedPeakPowerW = calib.Summary.P95PowerW
r.CalibratedPeakTempC = calib.Summary.P95TempC
r.PowerCalibrationTries = calib.Attempts
r.PowerLimitDerated = calib.Derated
r.Notes = append(r.Notes, calib.Notes...)
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
gpuResults[idx] = r
}
// Baseline: sample all GPUs together.
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
}
}
for _, idx := range selected {
perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once.
if !*serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
*serverIdleW = w
*serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
// Warmup: all GPUs simultaneously.
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
if warmupErr != nil {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
}
}
warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
for _, idx := range selected {
if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
if gpuResults[idx].ComputeCapability == "" {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
}
if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
supportedPrecisions = ccPrecisions
}
}
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPUs stay hot between windows.
eccBase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx)
}
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" {
return "steady"
}
return "gpu-all-steady-" + label
})
planCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(basePhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range supportedPrecisions {
phaseLogName := "gpu-all-steady-" + prec
phaseRows := phaseRowsByStage[phaseLogName]
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx)
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(perGPU),
}
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
}
if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
}
gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
}
}
// Snapshot throttle counters before steady.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
beforeThrottle[idx], _ = queryThrottleCounters(idx)
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
(*serverLoadedSamples)++
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
}
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx)
}
steadyRows := phaseRowsByStage["steady"]
parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx)
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
}
if pr, ok := parseResults[idx]; ok {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
gpuResults[idx].Backend = pr.Backend
gpuResults[idx].PrecisionResults = pr.Profiles
if pr.Fallback {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
}
if planErr != nil {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
}
}
// Cooldown: all GPUs together.
if spec.CooldownSec > 0 {
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
}
}
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
}
// Score and finalize each GPU.
for _, idx := range selected {
r := gpuResults[idx]
r.Scores = scoreBenchmarkGPUResult(*r)
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx]
switch {
case planErr != nil:
r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
case len(r.PrecisionFailures) > 0:
r.Status = "PARTIAL"
case pr.Fallback:
r.Status = "PARTIAL"
default:
r.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
}
}
// readBenchmarkHostConfig reads static CPU and memory configuration from
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
func readBenchmarkHostConfig() *BenchmarkHostConfig {
cfg := &BenchmarkHostConfig{}
populated := false
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
socketIDs := map[string]struct{}{}
coresPerSocket := map[string]int{}
var modelName string
threads := 0
for _, line := range strings.Split(string(data), "\n") {
kv := strings.SplitN(line, ":", 2)
if len(kv) != 2 {
continue
}
key := strings.TrimSpace(kv[0])
val := strings.TrimSpace(kv[1])
switch key {
case "processor":
threads++
case "model name":
if modelName == "" {
modelName = val
}
case "physical id":
socketIDs[val] = struct{}{}
case "cpu cores":
// Overwrite per-socket core count (last wins per socket, but all
// entries for the same socket report the same value).
if physLine := ""; physLine == "" {
// We accumulate below by treating cpu cores as a per-thread
// field; sum by socket requires a two-pass approach. Use the
// simpler approximation: totalCores = threads / (threads per core).
_ = val
}
}
}
// Second pass: per-socket core count.
var curSocket string
for _, line := range strings.Split(string(data), "\n") {
kv := strings.SplitN(line, ":", 2)
if len(kv) != 2 {
continue
}
key := strings.TrimSpace(kv[0])
val := strings.TrimSpace(kv[1])
switch key {
case "physical id":
curSocket = val
case "cpu cores":
if curSocket != "" {
if _, seen := coresPerSocket[curSocket]; !seen {
v, _ := strconv.Atoi(val)
coresPerSocket[curSocket] = v
}
}
}
}
totalCores := 0
for _, c := range coresPerSocket {
totalCores += c
}
cfg.CPUModel = modelName
cfg.CPUSockets = len(socketIDs)
if cfg.CPUSockets == 0 && threads > 0 {
cfg.CPUSockets = 1
}
cfg.CPUCores = totalCores
cfg.CPUThreads = threads
if modelName != "" || threads > 0 {
populated = true
}
}
// Parse /proc/meminfo for total physical RAM.
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
for _, line := range strings.Split(string(data), "\n") {
if strings.HasPrefix(line, "MemTotal:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
kb, _ := strconv.ParseUint(fields[1], 10, 64)
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
populated = true
}
break
}
}
}
if !populated {
return nil
}
return cfg
}
// startCPULoadSampler starts a goroutine that samples host CPU load every
// intervalSec seconds until stopCh is closed, then sends the collected
// samples on the returned channel.
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
ch := make(chan []float64, 1)
go func() {
var samples []float64
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
ch <- samples
return
case <-ticker.C:
if pct := sampleCPULoadPct(); pct > 0 {
samples = append(samples, pct)
}
}
}
}()
return ch
}
// summarizeCPULoad computes stats over sampled CPU load values and assigns
// a health status.
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
if len(samples) == 0 {
return nil
}
sorted := append([]float64(nil), samples...)
sort.Float64s(sorted)
var sum float64
for _, v := range sorted {
sum += v
}
avg := sum / float64(len(sorted))
p95 := sorted[int(float64(len(sorted))*0.95)]
max := sorted[len(sorted)-1]
cl := &BenchmarkCPULoad{
AvgPct: math.Round(avg*10) / 10,
MaxPct: math.Round(max*10) / 10,
P95Pct: math.Round(p95*10) / 10,
Samples: len(sorted),
}
// Compute standard deviation to detect instability.
var variance float64
for _, v := range sorted {
d := v - avg
variance += d * d
}
stdDev := math.Sqrt(variance / float64(len(sorted)))
switch {
case avg > 20 || max > 40:
cl.Status = "high"
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
case stdDev > 12:
cl.Status = "unstable"
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
default:
cl.Status = "ok"
}
return cl
}
// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// throttle counters. If a GPU starts throttling, the current targeted_power run
// is canceled immediately, the power limit is reduced, and a fresh full cycle
// is started again from the beginning. The selected reduced power limit stays
// active for the main benchmark and is restored by the caller afterwards.
func runBenchmarkPowerCalibration(
ctx context.Context,
verboseLog, runDir string,
gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string),
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil
}
canDerate := os.Geteuid() == 0
if !canDerate {
logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
}
type calibrationAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction
for _, idx := range gpuIndices {
info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW))
if originalLimitW <= 0 {
originalLimitW = int(math.Round(info.DefaultPowerLimitW))
}
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
if defaultLimitW <= 0 {
defaultLimitW = originalLimitW
}
appliedLimitW := originalLimitW
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
}
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
}
if minLimitW < derateStepW {
minLimitW = derateStepW
}
calib := benchmarkPowerCalibrationResult{
AppliedPowerLimitW: float64(appliedLimitW),
}
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
restore = append(restore, benchmarkRestoreAction{
name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
fn: func() {
_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
},
})
}
calibLoop:
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
beforeThrottle, _ := queryThrottleCounters(idx)
attemptCtx, cancel := context.WithCancel(ctx)
doneCh := make(chan calibrationAttemptResult, 1)
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
if err != nil {
continue
}
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
throttleReason = reason
cancel()
}
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
}
}
ticker.Stop()
cancel()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
break
}
// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
}
if !canDerate || appliedLimitW <= 0 {
break
}
nextLimitW := appliedLimitW - derateStepW
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
break
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = true
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
results[idx] = calib
}
}
return results, restore
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
var exitErr *exec.ExitError
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
return 300
case NvidiaBenchmarkProfileOvernight:
return 600
default:
return 120
}
}
func occupiedSlots(indices []int, current int) []int {
out := make([]int, 0, len(indices))
for _, idx := range indices {
if idx != current {
out = append(out, idx)
}
}
return out
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
for k, v := range src {
out[k] = v
}
return out
}
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
var b strings.Builder
b.WriteString("# Bee Bench Power Report\n\n")
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
if len(result.Findings) > 0 {
b.WriteString("## Summary\n\n")
for _, finding := range result.Findings {
fmt.Fprintf(&b, "- %s\n", finding)
}
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString("## Recommended Slot Order\n\n")
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
}
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
for _, step := range result.RampSteps {
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
}
b.WriteString("\n")
}
b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
}
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
if gpu.OccupiedSlotsNote != "" {
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
}
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
}
return b.String()
}
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
if len(result.RecommendedSlotOrder) > 0 {
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
}
for _, step := range result.RampSteps {
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
}
return b.String()
}
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
}
if logFunc == nil {
logFunc = func(string) {}
}
if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-bench/power"
}
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
if err != nil {
return "", err
}
if len(selected) == 0 {
return "", fmt.Errorf("no NVIDIA GPUs selected")
}
ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "power-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
}
verboseLog := filepath.Join(runDir, "verbose.log")
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
return "", infoErr
}
hostname, _ := os.Hostname()
result := NvidiaPowerBenchResult{
BenchmarkVersion: benchmarkVersion,
GeneratedAt: time.Now().UTC(),
Hostname: hostname,
ServerModel: readServerModel(),
BenchmarkProfile: opts.Profile,
SelectedGPUIndices: append([]int(nil), selected...),
OverallStatus: "OK",
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
defer func() {
for i := len(restoreActions) - 1; i >= 0; i-- {
restoreActions[i].fn()
}
}()
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
for _, idx := range selected {
info := infoByIndex[idx]
calib := calibByIndex[idx]
status := "OK"
if !calib.Completed {
status = "FAILED"
result.OverallStatus = "PARTIAL"
} else if calib.Derated {
status = "PARTIAL"
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
}
occupied := occupiedSlots(selected, idx)
note := ""
if len(occupied) > 0 {
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
}
gpus = append(gpus, NvidiaPowerBenchGPU{
Index: idx,
Name: info.Name,
BusID: info.BusID,
DefaultPowerLimitW: info.DefaultPowerLimitW,
AppliedPowerLimitW: calib.AppliedPowerLimitW,
MaxObservedPowerW: calib.Summary.P95PowerW,
MaxObservedTempC: calib.Summary.P95TempC,
CalibrationAttempts: calib.Attempts,
Derated: calib.Derated,
Status: status,
OccupiedSlots: occupied,
OccupiedSlotsNote: note,
Notes: append([]string(nil), calib.Notes...),
})
}
sort.Slice(gpus, func(i, j int) bool {
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
}
if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
}
if gpus[i].Derated != gpus[j].Derated {
return !gpus[i].Derated
}
return gpus[i].Index < gpus[j].Index
})
result.GPUs = gpus
result.RecommendedSlotOrder = make([]int, 0, len(gpus))
for _, gpu := range gpus {
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
}
if len(result.RecommendedSlotOrder) > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
}
for _, gpu := range gpus {
if gpu.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
}
}
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
for _, gpu := range gpus {
singleByIndex[gpu.Index] = gpu
}
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
for i := len(stepRestore) - 1; i >= 0; i-- {
stepRestore[i].fn()
}
ramp := NvidiaPowerBenchStep{
StepIndex: step,
GPUIndices: subset,
Status: "OK",
}
var realizationValues []float64
for _, idx := range subset {
calib := stepCalib[idx]
ramp.TotalObservedPowerW += calib.Summary.P95PowerW
if calib.Derated {
ramp.DeratedGPUCount++
ramp.Status = "PARTIAL"
}
if !calib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
continue
}
if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
realizationValues = append(realizationValues, realization)
}
}
if len(subset) > 0 {
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
}
if len(realizationValues) > 0 {
ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
ramp.MinPowerRealizationPct = realizationValues[0]
for _, v := range realizationValues[1:] {
if v < ramp.MinPowerRealizationPct {
ramp.MinPowerRealizationPct = v
}
}
}
if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
}
if ramp.DeratedGPUCount > 0 {
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
}
result.RampSteps = append(result.RampSteps, ramp)
}
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal power result: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
return "", fmt.Errorf("write result.json: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err)
}
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
return "", fmt.Errorf("write summary.txt: %w", err)
}
return runDir, nil
}