Files
bee/audit/internal/platform/benchmark.go
2026-04-16 11:43:01 +03:00

3642 lines
127 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package platform
import (
"context"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
// benchmarkVersion tags every result.json so downstream consumers can detect
// format changes between releases.
const benchmarkVersion = "2"
// benchmarkProfileSpec is the fixed timing plan for one benchmark profile:
// how long each stage of the run lasts, in seconds.
type benchmarkProfileSpec struct {
	Name        string // canonical profile name (an NvidiaBenchmarkProfile* constant)
	BaselineSec int    // idle telemetry sampling before any load
	WarmupSec   int    // warmup burn before the measured phases
	SteadySec   int    // total steady-state budget, split across precision phases
	NCCLSec     int    // interconnect (NCCL) test duration
	CooldownSec int    // post-load telemetry sampling; 0 disables cooldown
}
// benchmarkGPUInfo is the static inventory record for one GPU, populated from
// nvidia-smi --query-gpu CSV output and backfilled from `nvidia-smi -q`
// verbose output where CSV fields are unsupported (see
// enrichGPUInfoWithMaxClocks). Zero values mean "not reported".
type benchmarkGPUInfo struct {
	Index                int    // nvidia-smi device index
	UUID                 string
	Name                 string
	BusID                string // PCI bus id; used to match verbose -q sections
	VBIOS                string
	PowerLimitW          float64 // currently applied power limit
	DefaultPowerLimitW   float64 // factory-default power limit
	MaxGraphicsClockMHz  float64
	MaxMemoryClockMHz    float64
	BaseGraphicsClockMHz float64
	MultiprocessorCount  int // SM count (0 when the driver does not report it)
	// Temperature limits sourced from nvidia-smi -q verbose output.
	// ShutdownTempC is the hardware thermal shutdown threshold.
	// SlowdownTempC is the software throttle onset threshold.
	// Both fall back to safe conservative defaults when not available.
	ShutdownTempC float64 // fallback: 90°C
	SlowdownTempC float64 // fallback: 80°C
}
// benchmarkPowerCalibrationResult captures the outcome of a per-GPU power
// calibration pass. (The performance benchmark currently skips calibration —
// see the note in RunNvidiaBenchmark — but the parallel/ramp paths still
// consume this shape via calibByIndex.)
type benchmarkPowerCalibrationResult struct {
	Summary            BenchmarkTelemetrySummary // telemetry summary of the calibration window
	AppliedPowerLimitW float64                   // power limit left applied after calibration
	Attempts           int                       // number of calibration tries performed
	Derated            bool                      // true when the limit was lowered below default
	Completed          bool                      // calibration ran to completion
	Notes              []string                  // operator-facing free-form notes
	// CoolingWarning is set when the GPU throttled thermally with a clock drop
	// ≥20% while server fans were below 100% duty cycle — a signal that the
	// cooling system may not be correctly configured for full GPU load.
	CoolingWarning string
}
// benchmarkBurnProfile is one precision profile parsed out of a bee-gpu-burn
// log (see benchmarkReadyPattern / benchmarkSkippedPattern /
// benchmarkIterationsPattern for the line formats).
type benchmarkBurnProfile struct {
	name       string // profile name as printed by bee-gpu-burn, e.g. "fp16"
	category   string // precision category the profile belongs to
	supported  bool   // false when the device/userspace path cannot run it
	lanes      int
	m          uint64 // matrix dimensions from "dim=MxNxK" in the READY line
	n          uint64
	k          uint64
	iterations uint64 // completed iteration count
	notes      string // e.g. the reason text from a SKIPPED line
}
// benchmarkBurnParseResult is the structured form of one bee-gpu-burn log:
// device identity plus per-precision throughput results.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string // "major.minor" string, e.g. "9.0"
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	Fallback          bool // true when the run used the driver PTX fallback (scores not comparable)
}
// benchmarkRestoreAction is a named cleanup step (e.g. releasing a clock
// lock) registered during normalization and executed, in reverse order of
// registration, when the benchmark finishes.
type benchmarkRestoreAction struct {
	name string // label used for logging/debugging
	fn   func() // the actual undo action; runs exactly once
}
// Line patterns for parsing bee-gpu-burn output, e.g.:
//
//	fp16[0]=READY dim=8192x8192x8192   → benchmarkReadyPattern
//	fp4=SKIPPED no hardware support    → benchmarkSkippedPattern
//	fp16_iterations=1234               → benchmarkIterationsPattern
var (
	benchmarkReadyPattern      = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
//
// fp64 and fp4 are intentionally disabled for now: both are currently unstable
// on the target fleet and can abort the mixed steady stage after the earlier
// phases already collected useful telemetry.
//
// benchmarkSupportedPrecisions filters this list per GPU; its fp4 check only
// becomes relevant once fp4 is re-enabled here.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"}
// computeCapabilityCode converts a "major.minor" compute-capability string
// (e.g. "9.0") into a single integer code (90). Surrounding whitespace is
// tolerated; unparseable components degrade to zero, and an empty input
// yields 0.
func computeCapabilityCode(raw string) int {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return 0
	}
	majorPart, minorPart, hasMinor := strings.Cut(trimmed, ".")
	major, _ := strconv.Atoi(strings.TrimSpace(majorPart))
	minor := 0
	if hasMinor {
		minor, _ = strconv.Atoi(strings.TrimSpace(minorPart))
	}
	return major*10 + minor
}
// benchmarkSupportedPrecisions returns the subset of the default precision
// phases this GPU can run. fp4 is excluded on pre-Blackwell parts (compute
// capability known and below 10.0); an unknown capability (code 0) keeps
// every phase.
func benchmarkSupportedPrecisions(computeCapability string) []string {
	code := computeCapabilityCode(computeCapability)
	supported := make([]string, 0, len(benchmarkPrecisionPhases))
	for _, phase := range benchmarkPrecisionPhases {
		skip := phase == "fp4" && code > 0 && code < 100
		if !skip {
			supported = append(supported, phase)
		}
	}
	return supported
}
// benchmarkPrecisionEnabled reports whether the given precision category is
// one of the categories the benchmark currently exercises (fp64/fp4 are
// deliberately absent; see benchmarkPrecisionPhases).
func benchmarkPrecisionEnabled(category string) bool {
	enabled := [...]string{"int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32"}
	for _, name := range enabled {
		if category == name {
			return true
		}
	}
	return false
}
// buildBenchmarkSteadyPlan derives the per-precision steady windows plus the
// final combined ("mixed") window for a benchmark run.
//
// For the three named profiles the phase durations are fixed constants; for
// any other profile the configured SteadySec budget is split by weight: one
// unit per precision phase and five units for the mixed window.
//
// metricStage maps a plan label ("fp16", "mixed", ...) to the metric-stage
// name used when attributing telemetry rows to that phase.
//
// Returns the plan labels in execution order (mixed always last), the planned
// phases, and the two per-phase durations used to build them.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	if len(precisions) == 0 {
		// Fall back to the default phase list; copy so the package slice is
		// never aliased by callers.
		precisions = append([]string(nil), benchmarkPrecisionPhases...)
	}
	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec = 60
		mixedPhaseSec = 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec = 300
		mixedPhaseSec = 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec = 3600
		mixedPhaseSec = 14400
	default:
		// One weight unit per precision phase plus five for the mixed window.
		// totalWeight is always >= 5 (len(precisions) >= 0), so no
		// zero-division guard is needed; the previous `totalWeight <= 0`
		// check was unreachable and has been removed.
		totalWeight := len(precisions) + 5
		basePhaseSec = spec.SteadySec / totalWeight
		if basePhaseSec <= 0 {
			basePhaseSec = 1 // keep every phase non-empty even for tiny budgets
		}
		mixedPhaseSec = basePhaseSec * 5
	}
	planLabels = make([]string, 0, len(precisions)+1)
	planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
	for _, prec := range precisions {
		planLabels = append(planLabels, prec)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   prec,
			MetricStage: metricStage(prec),
			DurationSec: basePhaseSec,
		})
	}
	// The combined mixed-precision window always runs last.
	planLabels = append(planLabels, "mixed")
	planPhases = append(planPhases, benchmarkPlannedPhase{
		PlanLabel:   "mixed",
		MetricStage: metricStage("mixed"),
		DurationSec: mixedPhaseSec,
	})
	return planLabels, planPhases, basePhaseSec, mixedPhaseSec
}
// benchmarkPlanDurationsCSV renders the planned phase durations as a
// comma-separated list of seconds, in phase order, suitable for the
// bee-gpu-burn --precision-plan-seconds flag.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	var b strings.Builder
	for i, phase := range phases {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(phase.DurationSec))
	}
	return b.String()
}
// benchmarkPlannedPhaseStatus classifies the raw log output of one planned
// precision phase. It returns a status ("OK", "FAILED" or "UNSUPPORTED") and
// a human-readable note (empty for OK). Matching is case-insensitive.
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	if text == "" {
		return "FAILED", "phase produced no output"
	}
	unsupported := strings.Contains(text, "unsupported") || strings.Contains(text, "not supported")
	if strings.Contains(text, "phase_error=") {
		if unsupported || strings.Contains(text, "cublaslt_profiles=unsupported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	}
	if strings.Contains(text, "status=failed") {
		if unsupported {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	}
	return "OK", ""
}
// benchmarkCalibrationThrottleReason inspects the throttle-counter delta
// between two snapshots and names the dominant throttle cause, checked in
// severity order: hardware thermal first, then software thermal, then the
// hardware power brake. Returns "" when no throttling was observed.
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
	delta := diffThrottleCounters(before, after)
	if delta.HWThermalSlowdownUS > 0 {
		return "hw_thermal"
	}
	if delta.SWThermalSlowdownUS > 0 {
		return "sw_thermal"
	}
	if delta.HWPowerBrakeSlowdownUS > 0 {
		return "hw_power_brake"
	}
	return ""
}
// setBenchmarkPowerLimit applies a power cap (watts) to one GPU via
// `nvidia-smi -pl`. Non-positive limits are rejected up front. On failure the
// returned error embeds the trimmed command output for diagnostics.
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
	if powerLimitW <= 0 {
		return fmt.Errorf("invalid power limit %d", powerLimitW)
	}
	label := fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW)
	cmd := []string{"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW)}
	out, err := runSATCommandCtx(ctx, verboseLog, label, cmd, nil, nil)
	if err == nil {
		return nil
	}
	return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
}
// RunNvidiaBenchmark executes the full NVIDIA GPU benchmark for the selected
// devices and writes all artifacts (result.json, report.md, summary.txt,
// metric files, per-stage logs) into a timestamped run directory under
// baseDir. It returns the run directory path.
//
// Stage order: inventory + `nvidia-smi -q` capture → normalization
// (persistence mode, clock locks; undone via deferred restore actions) →
// per-GPU baseline / warmup / uninterrupted precision plan (sequential mode)
// or the parallel runner → optional scalability ramp (sequential mode, ≥2
// GPUs) → optional NCCL interconnect test → CPU-load and server-power
// summaries → scoring, findings, report rendering.
//
// ctx and logFunc may be nil; baseDir defaults to /var/log/bee-bench/perf.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {} // allow unconditional logFunc calls below
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/perf"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "perf-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   spec.Name,
		ParallelGPUs:       opts.ParallelGPUs,
		RampStep:           opts.RampStep,
		RampTotal:          opts.RampTotal,
		RampRunID:          opts.RampRunID,
		SelectedGPUIndices: append([]int(nil), selected...),
		HostConfig:         readBenchmarkHostConfig(),
		Normalization: BenchmarkNormalization{
			// Downgraded to "partial" by any inventory/normalization failure.
			Status: "full",
		},
	}
	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
	var metricRows []GPUMetricRow
	metricTimelineSec := 0.0
	gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
	// Server power characterization state — populated during per-GPU phases.
	var serverIdleW, serverLoadedWSum float64
	var serverIdleOK, serverLoadedOK bool
	var serverLoadedSamples int
	// Run nvidia-smi -q first: used both for the log file and as a fallback
	// source of max clock values when CSV clock fields are unsupported.
	var nvsmiQOut []byte
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		nvsmiQOut = out
		// Best-effort write; the benchmark proceeds without the log file.
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}
	// Enrich with max clocks from verbose output — covers GPUs where
	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		// Foreign compute load skews telemetry; record it but keep going.
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	defer func() {
		// Undo clock locks etc. in reverse order of application.
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()
	// No power calibration before performance benchmark — GPUs run at their
	// default power limits. PowerSustainScore is derived from steady-state power
	// observed during the benchmark itself.
	calibByIndex := make(map[int]benchmarkPowerCalibrationResult)
	// Start background CPU load sampler — samples every 10s during GPU phases.
	cpuStopCh := make(chan struct{})
	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
	if opts.ParallelGPUs {
		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
	} else {
		// Sequential path: each GPU runs baseline → warmup → precision plan
		// on its own while the others stay idle.
		for _, idx := range selected {
			gpuResult := BenchmarkGPUResult{
				Index: idx,
				// Pessimistic default; upgraded at the end of the loop body.
				Status: "FAILED",
			}
			if info, ok := infoByIndex[idx]; ok {
				gpuResult.UUID = info.UUID
				gpuResult.Name = info.Name
				gpuResult.BusID = info.BusID
				gpuResult.VBIOS = info.VBIOS
				gpuResult.PowerLimitW = info.PowerLimitW
				gpuResult.MultiprocessorCount = info.MultiprocessorCount
				gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
				gpuResult.ShutdownTempC = info.ShutdownTempC
				gpuResult.SlowdownTempC = info.SlowdownTempC
				gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
				gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
				gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
			}
			// calibByIndex is empty on this path (calibration disabled above),
			// so this block is currently a no-op kept for parity with the
			// parallel runner.
			if calib, ok := calibByIndex[idx]; ok {
				gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
				gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
				gpuResult.PowerCalibrationTries = calib.Attempts
				gpuResult.PowerLimitDerated = calib.Derated
				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
				if calib.CoolingWarning != "" {
					gpuResult.CoolingWarning = calib.CoolingWarning
				}
			}
			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
				gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
			}
			// NOTE(review): direct comparison misses wrapped errors;
			// errors.Is(err, context.Canceled) would be safer — confirm
			// collectBenchmarkSamples' error contract.
			baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
			if err != nil && err != context.Canceled {
				gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
			}
			gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
			// Sample server idle power once (first GPU only — server state is global).
			if !serverIdleOK {
				if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
					serverIdleW = w
					serverIdleOK = true
					logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
				}
			}
			warmupCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(spec.WarmupSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
			}
			logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
			warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
			if warmupErr != nil {
				// Without a successful warmup the steady phases are skipped
				// for this GPU; its Status stays FAILED.
				gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
				result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
				continue
			}
			warmupParse := parseBenchmarkBurnLog(string(warmupOut))
			if gpuResult.ComputeCapability == "" {
				gpuResult.ComputeCapability = warmupParse.ComputeCapability
			}
			// Run synthetic precision phases and the combined steady phase as one
			// uninterrupted command so the GPU stays hot between windows.
			eccBase, _ := queryECCCounters(idx)
			supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
				if label == "mixed" {
					return fmt.Sprintf("gpu-%d-steady", idx)
				}
				return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
			})
			planCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(basePhaseSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
				"--precision-plan", strings.Join(planLabels, ","),
				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
			}
			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
			// Attribute telemetry rows and raw logs to each planned phase.
			for _, phaseSpec := range planPhases {
				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
				}
				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
			}
			// Build one BenchmarkPrecisionSteadyPhase per precision window.
			for _, prec := range supportedPrecisions {
				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
				phaseRows := phaseRowsByStage[stageName]
				phase := BenchmarkPrecisionSteadyPhase{
					Precision: prec,
					Status:    "OK",
					Steady:    summarizeBenchmarkTelemetry(phaseRows),
				}
				if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
					phase.Status = status
					phase.Notes = note
					gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
				}
				// Sum throughput over the supported profiles in this window.
				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
				gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
			}
			// NOTE(review): these throttle counters are read *after* the plan
			// command above has already completed, so before/after bracket
			// almost no load and the resulting Throttle diff is likely
			// near-zero. Confirm whether the "before" read was meant to
			// precede runBenchmarkPlannedCommandWithMetrics.
			beforeThrottle, _ := queryThrottleCounters(idx)
			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
			// Sample server power via IPMI in parallel with the steady phase.
			// We collect readings every 5s and average them.
			//
			// NOTE(review): the steady load already ran as part of the plan
			// command above, and ipmiStopCh is closed immediately after this
			// goroutine starts — its 15s warmup select observes the closed
			// channel first and returns without collecting a single sample,
			// so serverLoaded* appears never to be set on the sequential
			// path. Looks like this sampler was meant to wrap the planned
			// command — confirm against the parallel runner.
			ipmiStopCh := make(chan struct{})
			ipmiResultCh := make(chan float64, 1)
			go func() {
				defer close(ipmiResultCh)
				var samples []float64
				ticker := time.NewTicker(5 * time.Second)
				defer ticker.Stop()
				// First sample after a short warmup delay.
				select {
				case <-ipmiStopCh:
					return
				case <-time.After(15 * time.Second):
				}
				for {
					if w, err := queryIPMIServerPowerW(); err == nil {
						samples = append(samples, w)
					}
					select {
					case <-ipmiStopCh:
						// Deliver the average of whatever was collected.
						if len(samples) > 0 {
							var sum float64
							for _, w := range samples {
								sum += w
							}
							ipmiResultCh <- sum / float64(len(samples))
						}
						return
					case <-ticker.C:
					}
				}
			}()
			close(ipmiStopCh)
			// ok is false when the goroutine exited without a sample (the
			// channel was closed empty).
			if loadedW, ok := <-ipmiResultCh; ok {
				serverLoadedWSum += loadedW
				serverLoadedSamples++
				serverLoadedOK = true
				logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
			}
			afterThrottle, _ := queryThrottleCounters(idx)
			if planErr != nil {
				gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
			}
			// The mixed window's telemetry and log feed the headline result.
			steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
			parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
			gpuResult.ComputeCapability = parseResult.ComputeCapability
			gpuResult.Backend = parseResult.Backend
			gpuResult.PrecisionResults = parseResult.Profiles
			if parseResult.Fallback {
				gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
			gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
			gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
			if eccFinal, err := queryECCCounters(idx); err == nil {
				gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
			}
			if spec.CooldownSec > 0 {
				cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
				if err != nil && err != context.Canceled {
					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
				}
				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
			}
			applyBenchmarkSteadyFallback(&gpuResult)
			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
			if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
				gpuResult.Notes = append(gpuResult.Notes,
					fmt.Sprintf("[HARD STOP] GPU %d: %s", idx, anomaly))
			}
			if warn := detectSlowdownTempExceedance(metricRows, idx, gpuResult.SlowdownTempC); warn != "" {
				gpuResult.Notes = append(gpuResult.Notes,
					fmt.Sprintf("[WARNING] GPU %d: %s", idx, warn))
				if gpuResult.Status == "OK" {
					gpuResult.Status = "PARTIAL"
				}
			}
			// Final status: plan error wins, then partial degradations.
			if planErr != nil {
				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
			} else if len(gpuResult.PrecisionFailures) > 0 {
				gpuResult.Status = "PARTIAL"
			} else if parseResult.Fallback {
				gpuResult.Status = "PARTIAL"
			} else {
				gpuResult.Status = "OK"
			}
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
		}
	} // end sequential path
	// Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs
	// and compute compute scalability relative to the best single-GPU result.
	// Only runs in sequential mode (each GPU was tested individually above) and
	// when there are at least 2 GPUs.
	if !opts.ParallelGPUs && len(selected) >= 2 {
		// Find the best single-card SyntheticScore as the 1-GPU baseline.
		var bestTOPS float64
		for _, g := range result.GPUs {
			if g.Scores.SyntheticScore > bestTOPS {
				bestTOPS = g.Scores.SyntheticScore
			}
		}
		if bestTOPS > 0 {
			var rampSteps []NvidiaPerformanceRampStep
			var scalabilityPcts []float64
			for k := 2; k <= len(selected); k++ {
				subset := append([]int(nil), selected[:k]...)
				rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k))
				_ = os.MkdirAll(rampDir, 0755)
				logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k))
				// Each ramp step accumulates into throwaway state so it does
				// not pollute the main result's power/metric aggregates.
				var rampResult NvidiaBenchmarkResult
				var rampIdleW, rampLoadedWSum float64
				var rampIdleOK, rampLoadedOK bool
				var rampLoadedSamples int
				var rampMetricRows []GPUMetricRow
				var rampTimelineSec float64
				emptyCalib := make(map[int]benchmarkPowerCalibrationResult)
				runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc,
					&rampResult, emptyCalib,
					&rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples,
					&rampMetricRows, &rampTimelineSec, "")
				var totalSynth, totalMixed float64
				for _, g := range rampResult.GPUs {
					totalSynth += g.Scores.SyntheticScore
					totalMixed += g.Scores.MixedScore
				}
				// Ideal scaling would be k × bestTOPS; express the actual sum
				// as a percentage of that.
				scalPct := totalSynth / (float64(k) * bestTOPS) * 100
				scalabilityPcts = append(scalabilityPcts, scalPct)
				stepStatus := "OK"
				if len(rampResult.GPUs) < k {
					stepStatus = "PARTIAL"
				}
				rampSteps = append(rampSteps, NvidiaPerformanceRampStep{
					StepIndex:          k,
					GPUIndices:         subset,
					TotalSyntheticTOPS: totalSynth,
					TotalMixedTOPS:     totalMixed,
					ScalabilityPct:     scalPct,
					Status:             stepStatus,
				})
			}
			result.PerformanceRampSteps = rampSteps
			result.PlatformPowerScore = benchmarkMean(scalabilityPcts)
			if len(scalabilityPcts) > 0 {
				// ScalabilityScore is the full-width (all-GPU) step.
				result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1]
			}
		}
	}
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
			}
		}
	}
	// Stop CPU load sampler and attach results.
	close(cpuStopCh)
	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
		result.CPULoad = summarizeCPULoad(cpuSamples)
		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
				result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
		}
	}
	// Compute server power characterization from accumulated IPMI samples.
	var gpuReportedSumW float64
	for _, gpu := range result.GPUs {
		gpuReportedSumW += gpu.Steady.AvgPowerW
	}
	var serverLoadedW float64
	if serverLoadedSamples > 0 {
		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
	}
	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
	result.Cooling = summarizeBenchmarkCooling(metricRows)
	// Apply server-power penalty when IPMI reports the server delta is much
	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
	// CalibratedPeakPowerW and PowerSustainScore unreliable.
	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
	if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
		factor := sp.ReportingRatio / 0.75
		for i := range result.GPUs {
			result.GPUs[i].Scores.CompositeScore *= factor
			result.GPUs[i].Notes = append(result.GPUs[i].Notes,
				fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
					sp.ReportingRatio, factor*100))
		}
	}
	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)
	writeBenchmarkMetricsFiles(runDir, metricRows)
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	report := renderBenchmarkReportWithCharts(result)
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}
// normalizeNvidiaBenchmarkOptionsForBenchmark canonicalizes caller-supplied
// options: the profile name is folded (case/space-insensitively) to one of
// the three known profiles — anything unrecognized becomes the standard
// profile — a negative SizeMB is clamped to zero, and both GPU index lists
// are deduplicated and sorted.
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
	profile := strings.TrimSpace(strings.ToLower(opts.Profile))
	if profile == NvidiaBenchmarkProfileStability {
		opts.Profile = NvidiaBenchmarkProfileStability
	} else if profile == NvidiaBenchmarkProfileOvernight {
		opts.Profile = NvidiaBenchmarkProfileOvernight
	} else {
		opts.Profile = NvidiaBenchmarkProfileStandard
	}
	if opts.SizeMB < 0 {
		opts.SizeMB = 0
	}
	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
	return opts
}
// resolveBenchmarkProfile maps a profile name (case/space-insensitive) to its
// timing specification. Unknown names resolve to the standard profile.
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
	name := strings.TrimSpace(strings.ToLower(profile))
	if name == NvidiaBenchmarkProfileStability {
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
	}
	if name == NvidiaBenchmarkProfileOvernight {
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
	}
	return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
}
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct {
	fields   string
	extended bool // whether this query includes optional extended fields
	minimal  bool // clock fields omitted; max clocks must be filled separately
}{
	{
		// Richest variant: base fields + clocks + SM count + default power limit.
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
		extended: true,
	},
	{
		// Base fields + clocks, for drivers without the extended attributes.
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
		extended: false,
	},
	{
		// Minimal last resort: identity and current power limit only.
		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
		minimal: true,
	},
}
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
//
// It also backfills power limits, multiprocessor count and thermal thresholds
// from the same section, always preferring values that are already set.
// infoByIndex is modified in place; GPUs whose bus id cannot be matched to a
// verbose section are left untouched.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
		return
	}
	// Build bus_id → index map for matching verbose sections to GPU indices.
	busToBenchIdx := make(map[string]int, len(infoByIndex))
	for idx, info := range infoByIndex {
		if info.BusID != "" {
			// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
			// while --query-gpu returns the same format; normalise to lower.
			busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
		}
	}
	// Split the verbose output into per-GPU sections on "^GPU " lines.
	gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
	maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
	shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
	slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
	sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
	for i, loc := range sectionStarts {
		// loc[2]:loc[3] is the captured bus id of this section header.
		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
		benchIdx, ok := busToBenchIdx[busID]
		if !ok {
			// Bus IDs from verbose output may have a different domain prefix;
			// try suffix match on the slot portion (XX:XX.X).
			for k, v := range busToBenchIdx {
				if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
					benchIdx = v
					ok = true
					break
				}
			}
		}
		if !ok {
			continue
		}
		// The section spans from this header to the next one (or EOF).
		end := len(nvsmiQ)
		if i+1 < len(sectionStarts) {
			end = sectionStarts[i+1][0]
		}
		section := nvsmiQ[loc[0]:end]
		// info is a copy of the map value; it is written back at the end.
		info := infoByIndex[benchIdx]
		if info.MaxGraphicsClockMHz == 0 {
			if m := maxGfxRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxGraphicsClockMHz = v
				}
			}
		}
		if info.MaxMemoryClockMHz == 0 {
			if m := maxMemRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxMemoryClockMHz = v
				}
			}
		}
		if info.DefaultPowerLimitW == 0 {
			if m := defaultPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.DefaultPowerLimitW = v
				}
			}
		}
		if info.PowerLimitW == 0 {
			if m := currentPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.PowerLimitW = v
				}
			}
		}
		if info.MultiprocessorCount == 0 {
			if m := smCountRe.FindSubmatch(section); m != nil {
				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
					info.MultiprocessorCount = v
				}
			}
		}
		if info.ShutdownTempC == 0 {
			if m := shutdownTempRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.ShutdownTempC = v
				}
			}
		}
		if info.SlowdownTempC == 0 {
			if m := slowdownTempRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.SlowdownTempC = v
				}
			}
		}
		infoByIndex[benchIdx] = info
	}
}
// queryBenchmarkGPUInfo builds the static inventory map (GPU index → info)
// for the given devices. The field sets in benchmarkGPUInfoQueries are tried
// from richest to most minimal; the first variant that both executes and
// parses wins. When every variant fails, the last error is returned.
//
// An empty gpuIndices queries all GPUs. Rows that are too short or whose
// index column does not parse are skipped silently.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	var lastErr error
	for _, q := range benchmarkGPUInfoQueries {
		args := []string{
			"--query-gpu=" + q.fields,
			"--format=csv,noheader,nounits",
		}
		if len(gpuIndices) > 0 {
			// Restrict the query to the selected devices.
			args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
		}
		out, err := satExecCommand("nvidia-smi", args...).Output()
		if err != nil {
			// Truncate the (long) field list to keep the error readable,
			// then fall through to the next, simpler field set.
			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
			continue
		}
		r := csv.NewReader(strings.NewReader(string(out)))
		r.TrimLeadingSpace = true
		// Field count varies per query variant and driver; accept any width
		// and validate row length manually below.
		r.FieldsPerRecord = -1
		rows, err := r.ReadAll()
		if err != nil {
			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
			continue
		}
		// Minimal variant has 6 columns; the clocked variants need at least 9.
		minFields := 6
		if !q.minimal {
			minFields = 9
		}
		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
		for _, row := range rows {
			if len(row) < minFields {
				continue
			}
			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
			if err != nil {
				continue
			}
			info := benchmarkGPUInfo{
				Index:       idx,
				UUID:        strings.TrimSpace(row[1]),
				Name:        strings.TrimSpace(row[2]),
				BusID:       strings.TrimSpace(row[3]),
				VBIOS:       strings.TrimSpace(row[4]),
				PowerLimitW: parseBenchmarkFloat(row[5]),
			}
			if !q.minimal {
				info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
				info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
				if len(row) >= 9 {
					info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
				}
				if q.extended {
					// Extended columns may still be missing on some drivers;
					// guard each one by row length.
					if len(row) >= 10 {
						info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
					}
					if len(row) >= 11 {
						info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
					}
				}
			}
			infoByIndex[idx] = info
		}
		return infoByIndex, nil
	}
	return nil, lastErr
}
// applyBenchmarkNormalization prepares each selected GPU for a reproducible
// benchmark run: it enables persistence mode and locks graphics/memory clocks
// to the maximum values reported in the GPU inventory (infoByIndex). Per-GPU
// outcomes are recorded on result.Normalization; the returned restore actions
// undo the applied clock locks (persistence mode is intentionally left on).
//
// Root is required for nvidia-smi -pm/-lgc/-lmc: without euid 0 the function
// marks normalization "partial", annotates each GPU record, and applies nothing.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}
	var restore []benchmarkRestoreAction
	for _, idx := range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Enable persistence mode so the driver stays loaded between commands
		// and the clock locks below remain in effect.
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}
		// Lock the graphics clock to the inventory maximum so throughput is
		// comparable across runs and hosts.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				// Copy the loop variable for the restore closure; the restore
				// runs during cleanup (possibly after ctx is cancelled), hence
				// context.Background().
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		} else {
			rec.GPUClockLockStatus = "skipped"
			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
			result.Normalization.Status = "partial"
		}
		// Memory clock lock (-lmc) is not supported on all GPU/driver
		// combinations; distinguish "unsupported" from a genuine failure by
		// inspecting the command output.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}
		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}
// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds and returns every row collected. Individual sampling
// failures are skipped silently; cancellation returns the rows gathered so
// far together with ctx.Err(). A non-positive duration collects nothing.
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	var collected []GPUMetricRow
	begin := time.Now()
	stopAt := begin.Add(time.Duration(durationSec) * time.Second)
	for {
		if err := ctx.Err(); err != nil {
			return collected, err
		}
		if batch, err := sampleBenchmarkTelemetry(gpuIndices); err == nil {
			secs := time.Since(begin).Seconds()
			for i := range batch {
				batch[i].ElapsedSec = secs
			}
			collected = append(collected, batch...)
		}
		if time.Now().After(stopAt) {
			return collected, nil
		}
		select {
		case <-ctx.Done():
			return collected, ctx.Err()
		case <-time.After(time.Second):
		}
	}
}
// runBenchmarkCommandWithMetrics executes a command while a background
// goroutine samples GPU telemetry once per second. Sampling stops when the
// command finishes. Each collected row's ElapsedSec is relative to command
// start. metricRows is appended only by the sampler goroutine and read only
// after close(stopCh) + <-doneCh, so the channel handshake establishes the
// happens-before edge and no lock is needed.
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
	stopCh := make(chan struct{})
	doneCh := make(chan struct{})
	var metricRows []GPUMetricRow
	start := time.Now()
	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				// Best-effort: drop a tick's samples on error rather than
				// aborting telemetry for the whole run.
				samples, err := sampleBenchmarkTelemetry(gpuIndices)
				if err != nil {
					continue
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()
	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
	// Stop the sampler and wait for it to exit before reading metricRows.
	close(stopCh)
	<-doneCh
	return out, metricRows, err
}
// benchmarkPlannedPhase describes one pre-planned phase of a multi-phase
// benchmark command, used to attribute telemetry samples and log output to
// stages after the run completes.
type benchmarkPlannedPhase struct {
	PlanLabel   string // plan-facing label; not consulted when splitting rows/logs
	MetricStage string // stage key under which telemetry and log lines are grouped
	DurationSec int    // planned phase duration; treated as a 1s minimum when splitting rows
}
// runBenchmarkPlannedCommandWithMetrics runs a command while sampling GPU
// telemetry, then partitions the samples (by planned phase durations) and the
// raw log output (by phase_begin/phase_end markers) into per-stage maps.
// Returns the raw output, the rows grouped by stage, the log grouped by
// stage, and the command error, in that order.
func runBenchmarkPlannedCommandWithMetrics(
	ctx context.Context,
	verboseLog, name string,
	cmd []string,
	env []string,
	gpuIndices []int,
	phases []benchmarkPlannedPhase,
	logFunc func(string),
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
	raw, sampled, runErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
	rowsByStage := splitBenchmarkRowsByPlannedPhase(sampled, phases)
	logByStage := splitBenchmarkLogByPlannedPhase(raw)
	return raw, rowsByStage, logByStage, runErr
}
// splitBenchmarkRowsByPlannedPhase buckets telemetry rows into stages by
// comparing each row's elapsed time against the cumulative planned phase
// durations (each phase counts for at least one second). Rows beyond the last
// planned boundary land in the final phase's stage.
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
	grouped := make(map[string][]GPUMetricRow, len(phases))
	if len(phases) == 0 || len(rows) == 0 {
		return grouped
	}
	for _, sample := range rows {
		chosen := len(phases) - 1
		boundary := 0.0
		for i, phase := range phases {
			step := phase.DurationSec
			if step <= 0 {
				step = 1
			}
			boundary += float64(step)
			if sample.ElapsedSec < boundary {
				chosen = i
				break
			}
		}
		stage := phases[chosen].MetricStage
		grouped[stage] = append(grouped[stage], sample)
	}
	return grouped
}
// splitBenchmarkLogByPlannedPhase groups log lines by the phase markers the
// benchmark binary emits: lines between "phase_begin=<stage>" and the next
// "phase_end=" are appended (newline-terminated) to that stage's buffer.
// Lines outside any phase are discarded; per-GPU prefixes are stripped before
// marker matching but preserved in the stored line.
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
	byStage := make(map[string][]byte)
	active := ""
	normalized := strings.ReplaceAll(string(raw), "\r\n", "\n")
	for _, line := range strings.Split(normalized, "\n") {
		marker := strings.TrimSpace(stripBenchmarkPrefix(line))
		if strings.HasPrefix(marker, "phase_begin=") {
			active = strings.TrimSpace(strings.TrimPrefix(marker, "phase_begin="))
			continue
		}
		if strings.HasPrefix(marker, "phase_end=") {
			active = ""
			continue
		}
		if active == "" {
			continue
		}
		byStage[active] = append(byStage[active], line...)
		byStage[active] = append(byStage[active], '\n')
	}
	return byStage
}
// benchmarkCoolingSample is one snapshot of host fan telemetry, attached to
// every GPU metric row captured at the same instant.
type benchmarkCoolingSample struct {
	AvgFanRPM             float64 // average fan RPM from the host fan sample
	AvgFanDutyCyclePct    float64 // average duty cycle; meaningful only when available
	FanDutyCycleAvailable bool    // true when a duty-cycle reading (direct or estimated) exists
	FanDutyCycleEstimated bool    // true when the duty cycle was derived rather than read directly
}
// sampleBenchmarkTelemetry captures one round of per-GPU metrics and stamps
// every row with the host fan telemetry taken at the same moment.
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
	rows, err := sampleGPUMetrics(gpuIndices)
	if err != nil {
		return nil, err
	}
	cooling := sampleBenchmarkCoolingSample()
	for i := range rows {
		rows[i].FanAvgRPM = cooling.AvgFanRPM
		rows[i].FanDutyCyclePct = cooling.AvgFanDutyCyclePct
		rows[i].FanDutyCycleAvailable = cooling.FanDutyCycleAvailable
		rows[i].FanDutyCycleEstimated = cooling.FanDutyCycleEstimated
	}
	return rows, nil
}
// sampleBenchmarkCoolingSample reads the current fan speeds and condenses
// them into an average RPM plus (when obtainable) a duty-cycle percentage.
// Fan read errors are ignored; zero values are returned in that case.
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
	fans, _ := sampleFanSpeeds()
	meanRPM, _, _ := fanRPMStats(fans)
	duty, available, estimated := sampleFanDutyCyclePctFromFans(fans)
	sample := benchmarkCoolingSample{AvgFanRPM: meanRPM}
	sample.AvgFanDutyCyclePct = duty
	sample.FanDutyCycleAvailable = available
	sample.FanDutyCycleEstimated = estimated
	return sample
}
// annotateBenchmarkMetricRows copies rows, tagging each with the stage name
// and shifting its elapsed time by offset onto the global benchmark timeline.
// The stage window ends at offset+durationSec; when durationSec is not
// positive, the end is stretched to cover the latest shifted sample instead.
// Returns nil for empty input.
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
	if len(rows) == 0 {
		return nil
	}
	end := offset + durationSec
	if end <= offset {
		end = offset
		for _, sample := range rows {
			if shifted := sample.ElapsedSec + offset; shifted > end {
				end = shifted
			}
		}
	}
	annotated := make([]GPUMetricRow, 0, len(rows))
	for _, sample := range rows {
		sample.Stage = stage
		sample.ElapsedSec += offset
		sample.StageStartSec = offset
		sample.StageEndSec = end
		annotated = append(annotated, sample)
	}
	return annotated
}
// appendBenchmarkMetrics annotates rows for the given stage starting at the
// current timeline cursor, appends them to allRows, and advances the cursor
// by the stage duration.
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
	stamped := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
	*cursor += durationSec
	*allRows = append(*allRows, stamped...)
}
// writeBenchmarkMetricsFiles emits the collected GPU metrics as CSV and HTML
// artifacts in runDir. Writes are best-effort: errors are ignored and nothing
// is written for an empty row set.
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
	if len(rows) == 0 {
		return
	}
	csvPath := filepath.Join(runDir, "gpu-metrics.csv")
	htmlPath := filepath.Join(runDir, "gpu-metrics.html")
	_ = WriteGPUMetricsCSV(csvPath, rows)
	_ = WriteGPUMetricsHTML(htmlPath, rows)
}
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
if path == "" || len(raw) == 0 {
return
}
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
return
}
defer f.Close()
header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage)
_, _ = f.WriteString(header)
if len(raw) > 0 {
_, _ = f.Write(raw)
if raw[len(raw)-1] != '\n' {
_, _ = f.WriteString("\n")
}
}
}
// parseBenchmarkBurnLog extracts run metadata and per-precision results from
// the burn tool's log output. Header lines (device=, compute_capability=,
// backend=, duration_s=) populate run metadata; per-lane ready/skip/iteration
// lines — matched by the benchmarkReadyPattern / benchmarkSkippedPattern /
// benchmarkIterationsPattern regexps declared elsewhere in this file —
// accumulate per-precision profiles. Profiles are emitted sorted by name.
// For supported profiles with a known duration and GEMM shape, throughput is
// computed as 2*M*N*K*iterations / duration (TOPS).
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
	result := benchmarkBurnParseResult{}
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	profiles := make(map[string]*benchmarkBurnProfile)
	for _, line := range lines {
		// Per-GPU "[gpu N] " prefixes are stripped before matching.
		line = stripBenchmarkPrefix(strings.TrimSpace(line))
		if line == "" {
			continue
		}
		switch {
		case strings.HasPrefix(line, "device="):
			result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
		case strings.HasPrefix(line, "compute_capability="):
			result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
		case strings.HasPrefix(line, "backend="):
			result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
			// "driver-ptx" indicates the fallback code path was used.
			result.Fallback = result.Backend == "driver-ptx"
		case strings.HasPrefix(line, "duration_s="):
			result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
		default:
			// Ready line: precision supported; records lane count and the
			// GEMM dimensions (M, N, K) used for the TOPS computation below.
			if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = true
				profile.lanes++
				profile.m, _ = strconv.ParseUint(m[3], 10, 64)
				profile.n, _ = strconv.ParseUint(m[4], 10, 64)
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			// Skipped line: precision unsupported; keep the reason as a note.
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			// Iteration line: accumulate completed iterations across lanes.
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters
			}
		}
	}
	// Emit profiles in deterministic (sorted-by-name) order.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		w := precisionWeight(profile.category)
		precision.Weight = w
		// TOPS = FLOPs / duration: a GEMM of shape MxNxK performs 2*M*N*K
		// operations per iteration.
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
			precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
if profile, ok := profiles[name]; ok {
return profile
}
category := "other"
switch {
case strings.HasPrefix(name, "fp64"):
category = "fp64"
case strings.HasPrefix(name, "fp32"):
category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"):
category = "fp16_bf16"
case strings.HasPrefix(name, "int8"):
category = "int8"
case strings.HasPrefix(name, "fp8"):
category = "fp8"
case strings.HasPrefix(name, "fp4"):
category = "fp4"
}
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
profiles[name] = profile
return profile
}
// precisionWeight maps a precision category to its fp32-equivalence factor —
// how much "real" numeric work one operation of that category performs
// relative to fp32 (the 1.0 baseline):
//
//	fp64 = 2.0   — double precision, 2× more bits per operand
//	fp32 = 1.0   — single precision baseline
//	fp16 = 0.5   — half precision
//	int8 = 0.25  — quarter precision
//	fp8  = 0.25  — quarter precision
//	fp4  = 0.125 — eighth precision
//
// Raw TOPS multiplied by this weight yields fp32-equivalent TOPS so that
// different precisions compare on one numeric scale. Unknown categories
// default to 1.0.
func precisionWeight(category string) float64 {
	switch category {
	case "fp64":
		return 2.0
	case "fp16_bf16":
		return 0.5
	case "int8", "fp8":
		return 0.25
	case "fp4":
		return 0.125
	default:
		// "fp32_tf32" and any unrecognized category score at the baseline.
		return 1.0
	}
}
// stripBenchmarkPrefix removes a leading per-GPU "[gpu N] " marker from a log
// line, returning the remainder. Lines without the marker (or with an
// unterminated one) are returned unchanged.
func stripBenchmarkPrefix(line string) string {
	if !strings.HasPrefix(line, "[gpu ") {
		return line
	}
	if _, rest, ok := strings.Cut(line, "] "); ok {
		return rest
	}
	return line
}
// summarizeBenchmarkTelemetry reduces a series of metric rows to aggregate
// statistics: mean and p95 for temperature, power and clocks, mean
// utilization figures, coefficients of variation for clock/power/temperature,
// and overall clock drift. DurationSec comes from the final row's elapsed
// timestamp. An empty input yields the zero summary.
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
	var summary BenchmarkTelemetrySummary
	n := len(rows)
	if n == 0 {
		return summary
	}
	summary.Samples = n
	summary.DurationSec = rows[n-1].ElapsedSec
	tempSeries := make([]float64, 0, n)
	powerSeries := make([]float64, 0, n)
	clockSeries := make([]float64, 0, n)
	memClockSeries := make([]float64, 0, n)
	usageSeries := make([]float64, 0, n)
	memUsageSeries := make([]float64, 0, n)
	for _, sample := range rows {
		tempSeries = append(tempSeries, sample.TempC)
		powerSeries = append(powerSeries, sample.PowerW)
		clockSeries = append(clockSeries, sample.ClockMHz)
		memClockSeries = append(memClockSeries, sample.MemClockMHz)
		usageSeries = append(usageSeries, sample.UsagePct)
		memUsageSeries = append(memUsageSeries, sample.MemUsagePct)
	}
	summary.AvgTempC = benchmarkMean(tempSeries)
	summary.P95TempC = benchmarkPercentile(tempSeries, 95)
	summary.AvgPowerW = benchmarkMean(powerSeries)
	summary.P95PowerW = benchmarkPercentile(powerSeries, 95)
	summary.AvgGraphicsClockMHz = benchmarkMean(clockSeries)
	summary.P95GraphicsClockMHz = benchmarkPercentile(clockSeries, 95)
	summary.AvgMemoryClockMHz = benchmarkMean(memClockSeries)
	summary.P95MemoryClockMHz = benchmarkPercentile(memClockSeries, 95)
	summary.AvgUsagePct = benchmarkMean(usageSeries)
	summary.AvgMemUsagePct = benchmarkMean(memUsageSeries)
	summary.ClockCVPct = benchmarkCV(clockSeries)
	summary.PowerCVPct = benchmarkCV(powerSeries)
	summary.TempCVPct = benchmarkCV(tempSeries)
	summary.ClockDriftPct = benchmarkClockDrift(clockSeries)
	return summary
}
// summarizeBenchmarkCooling aggregates host fan telemetry from metric rows
// into a cooling summary. Returns nil when no usable fan data (positive RPM
// or an available duty-cycle reading) was collected. Notes flag estimated
// duty-cycle readings and RPM-only hosts.
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
	if len(rows) == 0 {
		return nil
	}
	var rpms, duties []float64
	estimated := false
	for _, sample := range rows {
		if sample.FanAvgRPM > 0 {
			rpms = append(rpms, sample.FanAvgRPM)
		}
		if !sample.FanDutyCycleAvailable {
			continue
		}
		duties = append(duties, sample.FanDutyCyclePct)
		estimated = estimated || sample.FanDutyCycleEstimated
	}
	if len(rpms) == 0 && len(duties) == 0 {
		return nil
	}
	out := &BenchmarkCoolingSummary{
		Available:             true,
		AvgFanRPM:             benchmarkMean(rpms),
		FanDutyCycleEstimated: estimated,
	}
	if len(duties) == 0 {
		out.Notes = append(out.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
		return out
	}
	out.FanDutyCycleAvailable = true
	out.AvgFanDutyCyclePct = benchmarkMean(duties)
	out.P95FanDutyCyclePct = benchmarkPercentile(duties, 95)
	if out.FanDutyCycleEstimated {
		out.Notes = append(out.Notes, "fan duty cycle is estimated from the highest fan RPM observed since boot; treat it as an approximation, not a direct PWM reading")
	}
	return out
}
// benchmarkTelemetryAvailable reports whether the summary carries any real
// telemetry — at least one sample or a non-zero recorded duration.
func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
	if summary.Samples > 0 {
		return true
	}
	return summary.DurationSec > 0
}
// benchmarkPrecisionSteadyFallback selects the best per-precision steady
// summary to stand in for missing mixed-phase telemetry. Preference order:
// longest steady duration, then highest p95 power on ties. Returns the chosen
// summary, its precision label, and whether any usable phase existed.
func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
	var best BenchmarkTelemetrySummary
	var label string
	have := false
	for _, candidate := range phases {
		s := candidate.Steady
		if !benchmarkTelemetryAvailable(s) {
			continue
		}
		better := !have
		if !better {
			switch {
			case s.DurationSec > best.DurationSec:
				better = true
			case s.DurationSec == best.DurationSec && s.P95PowerW > best.P95PowerW:
				better = true
			}
		}
		if better {
			best, label, have = s, candidate.Precision, true
		}
	}
	return best, label, have
}
// applyBenchmarkSteadyFallback fills gpu.Steady from the best per-precision
// steady phase when the mixed-phase steady telemetry is missing, appending a
// note naming the precision the fallback came from. No-op when gpu is nil,
// steady telemetry is already present, or no usable phase exists.
func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
	if gpu == nil {
		return
	}
	if benchmarkTelemetryAvailable(gpu.Steady) {
		return
	}
	fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady)
	if !ok {
		return
	}
	gpu.Steady = fallback
	note := fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label)
	gpu.Notes = append(gpu.Notes, note)
}
// scoreBenchmarkGPUResult turns a GPU's measured benchmark results into the
// scorecard: throughput scores (synthetic, mixed, combined compute), sustain
// scores for power and thermals, per-type throttle percentages, a combined
// stability score, the temperature headroom to hardware shutdown, a server
// quality score, and throughput normalized per SM per GHz.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}
	// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
	// Each precision ran alone with full GPU dedicated — peak capability.
	for _, p := range gpu.PrecisionSteady {
		if !benchmarkPrecisionEnabled(p.Precision) {
			continue
		}
		score.SyntheticScore += p.WeightedTeraOpsPerSec
	}
	// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
	// All precisions compete simultaneously — closer to real inference workloads.
	for _, p := range gpu.PrecisionResults {
		if p.Supported && benchmarkPrecisionEnabled(p.Category) {
			score.MixedScore += p.WeightedTeraOpsPerSec
		}
	}
	// MixedEfficiency = MixedScore / SyntheticScore.
	// Measures how well the GPU sustains throughput under concurrent mixed load.
	// A healthy GPU scores ~0.80–0.95; severe degradation suggests bandwidth
	// contention or scheduler inefficiency.
	if score.SyntheticScore > 0 && score.MixedScore > 0 {
		score.MixedEfficiency = score.MixedScore / score.SyntheticScore
	}
	// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
	// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
	// bonus for GPUs that handle mixed-precision concurrency well.
	// Falls back to MixedScore alone when per-precision data is absent.
	switch {
	case score.SyntheticScore > 0:
		score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
	case score.MixedScore > 0:
		score.ComputeScore = score.MixedScore
	}
	// PowerSustainScore: how stable is GPU power draw during the benchmark?
	// High variance means the workload is bursting or the power delivery is
	// unstable. Score = max(0, 100 − PowerCVPct × 3).
	// At 10% CV → score 70; at 33%+ CV → score 0.
	// Uses per-precision windows when available (each runs a single kernel,
	// so CV reflects genuine power regulation, not workload switching).
	if len(gpu.PrecisionSteady) > 0 {
		var sum float64
		var count int
		for _, p := range gpu.PrecisionSteady {
			if !benchmarkPrecisionEnabled(p.Precision) {
				continue
			}
			sum += clampScore(100 - p.Steady.PowerCVPct*3)
			count++
		}
		if count > 0 {
			score.PowerSustainScore = sum / float64(count)
		}
	} else if gpu.Steady.PowerCVPct > 0 {
		score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
	}
	// ThermalSustainScore: how stable is GPU temperature during the benchmark?
	// High variance means cooling is inconsistent (fan bursts, liquid flow
	// instability, or frequent transitions in and out of throttle).
	// Score = max(0, 100 − TempCVPct × 3).
	if gpu.Steady.TempCVPct > 0 {
		score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3)
	} else {
		// TempCV not recorded — fall back to 100 (no penalty).
		score.ThermalSustainScore = 100
	}
	// Throttle breakdown: compute per-type percentages for diagnosis.
	// Each counter measures microseconds spent in that throttle state during
	// the steady-state window. Counters can overlap (e.g. thermal + power cap
	// simultaneously), so they are reported independently, not summed.
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	score.ThermalThrottlePct = math.Min(100,
		float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS*100)
	score.PowerCapThrottlePct = math.Min(100,
		float64(gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
	score.SyncBoostThrottlePct = math.Min(100,
		float64(gpu.Throttle.SyncBoostUS)/runtimeUS*100)
	// StabilityScore: combined throttle signal (thermal + power cap).
	// Score = max(0, 100 − combined_throttle_pct).
	// 1% throttle → 99; 10% → 90; any throttle > 0 is penalised.
	combinedThrottlePct := math.Min(100,
		float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS+gpu.Throttle.SWPowerCapUS)/runtimeUS*100)
	score.StabilityScore = clampScore(100 - combinedThrottlePct)
	// TempHeadroomC: distance from p95 temperature to the GPU's hardware
	// shutdown threshold (sourced from nvidia-smi -q "GPU Shutdown Temp").
	// Fallback: 90°C when not available.
	// Assessed independently of throttle — a GPU at 86°C without any throttle
	// counter still has limited headroom and operates in degraded reliability zone.
	// Warning zone: headroom < (shutdownTemp - slowdownTemp), i.e. past slowdown onset.
	// Critical zone: headroom < 10°C from shutdown.
	if gpu.Steady.P95TempC > 0 {
		shutdownTemp := gpu.ShutdownTempC
		if shutdownTemp <= 0 {
			shutdownTemp = 90
		}
		score.TempHeadroomC = shutdownTemp - gpu.Steady.P95TempC
	}
	score.ServerQualityScore = serverQualityScore(score)
	score.CompositeScore = score.ComputeScore
	// Normalize throughput by SM count and average clock so different GPU
	// models can be compared per streaming multiprocessor per GHz.
	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
	}
	return score
}
// compositeBenchmarkScore is kept for compatibility with legacy callers.
// The composite is simply ComputeScore: throttling already depresses measured
// TOPS directly, so no additional quality multiplier is layered on top.
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
	composite := score.ComputeScore
	return composite
}
// serverQualityScore condenses the three sustain signals into a 0–100 score
// of server infrastructure quality, independent of GPU model or raw speed.
// Weights:
//
//	StabilityScore (throttle time)  0.40 — heaviest: direct evidence the GPU can't sustain load
//	PowerSustainScore (power CV)    0.30 — unstable draw hints at PSU/VRM issues
//	ThermalSustainScore (temp CV)   0.30 — unstable temp hints at airflow/cooling issues
func serverQualityScore(score BenchmarkScorecard) float64 {
	fraction := 0.40*(score.StabilityScore/100.0) +
		0.30*(score.PowerSustainScore/100.0) +
		0.30*(score.ThermalSustainScore/100.0)
	return clampScore(fraction * 100)
}
// detectPowerAnomaly scans one GPU's steady-state metric rows for a sudden
// power drop — a symptom of bad cable contact, VRM fault, or a thermal event
// on the power delivery path. Returns a non-empty description on detection.
//
// Algorithm: a 5-sample rolling baseline of preceding samples; a sample is
// flagged when it falls ≥30% below that baseline while the GPU is otherwise
// loaded (usage ≥ 50%). A sustained power-cap throttle is not flagged here —
// PowerCapThrottlePct already captures that.
func detectPowerAnomaly(rows []GPUMetricRow, gpuIndex int) string {
	const (
		window       = 5
		dropLimitPct = 30.0
		loadFloorPct = 50.0
	)
	// Keep only this GPU's samples from stages tagged as steady state.
	var filtered []GPUMetricRow
	for _, row := range rows {
		if row.GPUIndex != gpuIndex || row.Stage == "" || !strings.Contains(row.Stage, "steady") {
			continue
		}
		filtered = append(filtered, row)
	}
	if len(filtered) < window+2 {
		return ""
	}
	// Seed the rolling sum with the first window of samples.
	var rolling float64
	for _, row := range filtered[:window] {
		rolling += row.PowerW
	}
	for i := window; i < len(filtered); i++ {
		baseline := rolling / float64(window)
		cur := filtered[i]
		if baseline > 0 && cur.UsagePct >= loadFloorPct {
			dropPct := (baseline - cur.PowerW) / baseline * 100
			if dropPct >= dropLimitPct {
				return fmt.Sprintf("sudden power drop detected at t=%.0fs: %.0f W → %.0f W (%.0f%% below rolling baseline) — possible bad cable contact or VRM fault",
					cur.ElapsedSec, baseline, cur.PowerW, dropPct)
			}
		}
		// Advance the rolling window by one sample.
		rolling -= filtered[i-window].PowerW
		rolling += cur.PowerW
	}
	return ""
}
// detectSlowdownTempExceedance flags steady-state temperature samples for one
// GPU that met or exceeded the slowdown threshold (fallback 80°C when the
// supplied threshold is not positive). Even a single spike is worth reporting
// — this is a real-time signal distinct from p95 statistics. The returned
// warning includes the exceedance count and the observed peak temperature;
// it is empty when no sample crossed the threshold.
func detectSlowdownTempExceedance(rows []GPUMetricRow, gpuIndex int, slowdownTempC float64) string {
	threshold := slowdownTempC
	if threshold <= 0 {
		threshold = 80
	}
	peak := 0.0
	hits := 0
	for _, sample := range rows {
		if sample.GPUIndex != gpuIndex || !strings.Contains(sample.Stage, "steady") {
			continue
		}
		if sample.TempC > peak {
			peak = sample.TempC
		}
		if sample.TempC >= threshold {
			hits++
		}
	}
	if hits == 0 {
		return ""
	}
	return fmt.Sprintf(
		"temperature exceeded slowdown threshold (%.0f°C) in %d sample(s) during steady state — peak %.1f°C",
		threshold, hits, peak)
}
// detectBenchmarkDegradationReasons derives machine-readable reason tags for
// treating a GPU's result as degraded: significant throttle time (power cap
// ≥5%, thermal/sync-boost ≥1% of steady state), average clock more than 10%
// under the lock target, a low stability score, partial normalization, a
// derated power limit, and any ECC errors. Output order is stable and
// duplicates are removed.
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
	steadyUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	var tags []string
	add := func(cond bool, tag string) {
		if cond {
			tags = append(tags, tag)
		}
	}
	add(float64(gpu.Throttle.SWPowerCapUS)/steadyUS >= 0.05, "power_capped")
	add(float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/steadyUS >= 0.01, "thermal_limited")
	add(float64(gpu.Throttle.SyncBoostUS)/steadyUS >= 0.01, "sync_boost_limited")
	add(gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90, "low_sm_clock_vs_target")
	add(gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85, "variance_too_high")
	add(normalizationStatus != "full", "normalization_partial")
	add(gpu.PowerLimitDerated, "power_limit_derated")
	add(gpu.ECC.Uncorrected > 0, "ecc_uncorrected_errors")
	add(gpu.ECC.Corrected > 0, "ecc_corrected_errors")
	return dedupeStrings(tags)
}
// runBenchmarkInterconnect runs nccl-tests all_reduce_perf across the
// selected GPUs (payload sweep 512M→4G, doubling each step) and records
// average/peak algorithm and bus bandwidth in the result. The raw tool output
// is always written to nccl-all-reduce.log in runDir. On command failure the
// result keeps Status "UNSUPPORTED" with the trimmed output attached as a note.
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
	result := &BenchmarkInterconnectResult{
		Status:             "UNSUPPORTED",
		Attempted:          true,
		SelectedGPUIndices: append([]int(nil), gpuIndices...),
	}
	cmd := []string{
		"all_reduce_perf",
		"-b", "512M",
		"-e", "4G",
		"-f", "2",
		"-g", strconv.Itoa(len(gpuIndices)),
		// Scale iterations with the profile's NCCL budget, never below 20.
		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
	}
	env := []string{
		// Pin device ordering to bus IDs so index selection is deterministic.
		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
	}
	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
	// Persist the raw output regardless of outcome (best-effort).
	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
	if err != nil {
		result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
		return result
	}
	avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
	result.Status = "OK"
	result.Supported = true
	result.AvgAlgBWGBps = avgAlg
	result.MaxAlgBWGBps = maxAlg
	result.AvgBusBWGBps = avgBus
	result.MaxBusBWGBps = maxBus
	return result
}
// parseNCCLAllReduceOutput extracts algorithm and bus bandwidth figures from
// nccl-tests all_reduce_perf output. For every data line (non-comment, at
// least 8 columns) it takes the first numeric (time, algbw, busbw) triple
// with a positive time value. Returns the mean and max of both bandwidth
// series, or all zeros when no data line parsed.
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
	normalized := strings.ReplaceAll(raw, "\r\n", "\n")
	var algSeries, busSeries []float64
	for _, line := range strings.Split(normalized, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		cols := strings.Fields(line)
		if len(cols) < 8 {
			continue
		}
		for i := 0; i+2 < len(cols); i++ {
			timeVal, errT := strconv.ParseFloat(cols[i], 64)
			algVal, errA := strconv.ParseFloat(cols[i+1], 64)
			busVal, errB := strconv.ParseFloat(cols[i+2], 64)
			if errT != nil || errA != nil || errB != nil || !(timeVal > 0) {
				continue
			}
			algSeries = append(algSeries, algVal)
			busSeries = append(busSeries, busVal)
			break
		}
	}
	if len(algSeries) == 0 {
		return 0, 0, 0, 0
	}
	return benchmarkMean(algSeries), benchmarkMax(algSeries), benchmarkMean(busSeries), benchmarkMax(busSeries)
}
// queryThrottleCounters reads the cumulative clock-event (throttle) counters
// for one GPU via nvidia-smi. Each value is the total microseconds spent in
// that throttle state since the counters last reset; diff two snapshots with
// diffThrottleCounters to attribute throttle time to a benchmark window.
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkThrottleCounters{}, err
	}
	// Output is one CSV row whose column order mirrors the query above.
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 5 {
		return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
	}
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           parseBenchmarkUint64(fields[0]),
		SWThermalSlowdownUS:    parseBenchmarkUint64(fields[1]),
		SyncBoostUS:            parseBenchmarkUint64(fields[2]),
		HWThermalSlowdownUS:    parseBenchmarkUint64(fields[3]),
		HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
	}, nil
}
// diffThrottleCounters returns the per-counter growth between two throttle
// snapshots, clamping each delta at zero if a counter reset mid-run.
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
	var delta BenchmarkThrottleCounters
	delta.SWPowerCapUS = saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS)
	delta.SWThermalSlowdownUS = saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS)
	delta.SyncBoostUS = saturatingSub(after.SyncBoostUS, before.SyncBoostUS)
	delta.HWThermalSlowdownUS = saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS)
	delta.HWPowerBrakeSlowdownUS = saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS)
	return delta
}
// queryECCCounters reads the volatile (since boot/reset) corrected and
// uncorrected ECC error totals for one GPU via nvidia-smi. When the fields do
// not parse as integers — typically because ECC is disabled on the GPU —
// zero counters and a nil error are returned deliberately, so callers treat
// the GPU as having no observed ECC errors.
func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkECCCounters{}, err
	}
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 2 {
		return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
	}
	corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
	uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
	if err1 != nil || err2 != nil {
		// ECC may be disabled on this GPU — return zero counters silently.
		return BenchmarkECCCounters{}, nil
	}
	return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
}
// diffECCCounters returns the growth in corrected and uncorrected ECC error
// counts between two snapshots, clamped at zero on counter reset.
func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
	var delta BenchmarkECCCounters
	delta.Corrected = saturatingSub(after.Corrected, before.Corrected)
	delta.Uncorrected = saturatingSub(after.Uncorrected, before.Uncorrected)
	return delta
}
// queryActiveComputeApps lists compute processes currently running on the
// given GPUs (all GPUs when the list is empty) as raw CSV lines of
// gpu_uuid, pid, process_name. Blank lines are dropped.
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
	args := []string{
		"--query-compute-apps=gpu_uuid,pid,process_name",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, err
	}
	var apps []string
	for _, raw := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		if entry := strings.TrimSpace(raw); entry != "" {
			apps = append(apps, entry)
		}
	}
	return apps, nil
}
// finalizeBenchmarkGPUResult applies reporting defaults: an empty status
// becomes "OK", and a missing composite score falls back to the compute
// score. The input is not mutated; the adjusted copy is returned.
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
	out := gpu
	if out.Status == "" {
		out.Status = "OK"
	}
	if out.Scores.CompositeScore == 0 {
		out.Scores.CompositeScore = out.Scores.ComputeScore
	}
	return out
}
// buildBenchmarkFindings converts a completed benchmark result into a flat,
// deduplicated list of human-readable findings. Findings cover: overall
// pass/fail counts, normalization caveats, per-GPU degradation reasons
// (power, thermal, sync-boost, ECC, …), temperature headroom versus the
// per-GPU shutdown/slowdown thresholds, precision-coverage gaps, power-limit
// anomalies, interconnect bandwidth, host CPU load, and server-vs-GPU power
// reporting consistency. Severity is encoded in bracketed prefixes such as
// [HARD STOP], [THERMAL], [POWER], [WARNING].
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
	var findings []string
	// Overall pass/fail summary across all benchmarked GPUs.
	passed := 0
	for _, gpu := range result.GPUs {
		if gpu.Status == "OK" {
			passed++
		}
	}
	total := len(result.GPUs)
	if total > 0 {
		if passed == total {
			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
		} else {
			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
		}
	}
	if result.Normalization.Status != "full" {
		findings = append(findings, "Environment normalization was partial; compare results with caution.")
	}
	// Per-GPU findings: degradation reasons first, then temperature headroom,
	// cooling, precision coverage, backend and power-limit checks.
	for _, gpu := range result.GPUs {
		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
			continue
		}
		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
			continue
		}
		for _, reason := range gpu.DegradationReasons {
			switch reason {
			case "power_capped":
				findings = append(findings, fmt.Sprintf(
					"[POWER] GPU %d: power cap throttle %.1f%% of steady state — server is not delivering full TDP to the GPU.",
					gpu.Index, gpu.Scores.PowerCapThrottlePct))
			case "thermal_limited":
				// Hard stop check: thermal throttle while fans are not at maximum.
				// This means the server does not see GPU thermals — incompatible config.
				if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
					result.Cooling.P95FanDutyCyclePct < 95 {
					findings = append(findings, fmt.Sprintf(
						"[HARD STOP] GPU %d: thermal throttle (%.1f%% of time) while fans peaked at only %.0f%% duty cycle — server cooling is not responding to GPU heat load. Configuration is likely incompatible.",
						gpu.Index, gpu.Scores.ThermalThrottlePct, result.Cooling.P95FanDutyCyclePct))
				} else {
					findings = append(findings, fmt.Sprintf(
						"[THERMAL] GPU %d: thermal throttle %.1f%% of steady state.",
						gpu.Index, gpu.Scores.ThermalThrottlePct))
				}
			case "sync_boost_limited":
				findings = append(findings, fmt.Sprintf(
					"[SYNC] GPU %d: sync boost throttle %.1f%% of steady state — GPUs are constraining each other's clocks.",
					gpu.Index, gpu.Scores.SyncBoostThrottlePct))
			case "low_sm_clock_vs_target":
				findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
			case "variance_too_high":
				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
			case "normalization_partial":
				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
			case "power_limit_derated":
				findings = append(findings, fmt.Sprintf("[POWER] GPU %d could not sustain full TDP in this server; benchmark ran at reduced limit %.0f W.", gpu.Index, gpu.PowerLimitW))
			case "ecc_uncorrected_errors":
				findings = append(findings, fmt.Sprintf(
					"[HARD STOP] GPU %d: %d uncorrected ECC error(s) detected — possible hardware fault. Do not use in production.",
					gpu.Index, gpu.ECC.Uncorrected))
			case "ecc_corrected_errors":
				findings = append(findings, fmt.Sprintf(
					"[WARNING] GPU %d: %d corrected ECC error(s) — possible DRAM degradation, monitor closely.",
					gpu.Index, gpu.ECC.Corrected))
			}
		}
		// Temperature headroom checks — independent of throttle counters.
		// Shutdown and slowdown thresholds are per-GPU from nvidia-smi -q;
		// fall back to 90°C / 80°C when unavailable.
		if gpu.Steady.P95TempC > 0 {
			shutdownTemp := gpu.ShutdownTempC
			if shutdownTemp <= 0 {
				shutdownTemp = 90
			}
			slowdownTemp := gpu.SlowdownTempC
			if slowdownTemp <= 0 {
				slowdownTemp = 80
			}
			headroom := shutdownTemp - gpu.Steady.P95TempC
			switch {
			case headroom < 10:
				findings = append(findings, fmt.Sprintf(
					"[HARD STOP] GPU %d: p95 temperature %.1f°C — only %.1f°C from shutdown threshold (%.0f°C). Do not operate.",
					gpu.Index, gpu.Steady.P95TempC, headroom, shutdownTemp))
			case gpu.Steady.P95TempC >= slowdownTemp:
				findings = append(findings, fmt.Sprintf(
					"[THERMAL] GPU %d: p95 temperature %.1f°C exceeds slowdown threshold (%.0f°C) — %.1f°C headroom to shutdown. Operating in degraded reliability zone.",
					gpu.Index, gpu.Steady.P95TempC, slowdownTemp, headroom))
			}
		}
		// Calibration-phase cooling warning (set when the GPU throttled while
		// fans were below full duty) is surfaced with an operator action.
		if gpu.CoolingWarning != "" {
			findings = append(findings, fmt.Sprintf(
				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
				gpu.Index, gpu.CoolingWarning,
			))
		}
		if len(gpu.PrecisionFailures) > 0 {
			findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
		}
		if gpu.Backend == "driver-ptx" {
			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
		}
		// Flag power limits configured below 95% of the board default.
		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
			findings = append(findings, fmt.Sprintf(
				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
			))
		}
		// Flag significant TDP deviation (over or under) from calibration.
		if gpu.CalibratedPeakPowerW > 0 {
			ref := gpu.DefaultPowerLimitW
			if ref <= 0 {
				ref = gpu.PowerLimitW
			}
			if ref > 0 {
				deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
				switch {
				case deviationPct < -10:
					findings = append(findings, fmt.Sprintf(
						"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
						gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
					))
				case deviationPct > 5:
					findings = append(findings, fmt.Sprintf(
						"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
						gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
					))
				}
			}
		}
	}
	// System-level findings: interconnect, host CPU load, server power.
	if result.Interconnect != nil && result.Interconnect.Supported {
		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
	}
	if cl := result.CPULoad; cl != nil {
		switch cl.Status {
		case "high":
			findings = append(findings, fmt.Sprintf(
				"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
				cl.AvgPct, cl.MaxPct,
			))
		case "unstable":
			findings = append(findings, fmt.Sprintf(
				"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
				cl.AvgPct, cl.P95Pct,
			))
		}
	}
	// Compare the IPMI-measured server power delta against the sum of
	// GPU-reported power: a low ratio suggests GPU telemetry over-reports,
	// a high ratio suggests other components draw power under GPU load.
	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
		if sp.ReportingRatio < 0.75 {
			findings = append(findings, fmt.Sprintf(
				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
			))
		} else if sp.ReportingRatio > 1.25 {
			findings = append(findings, fmt.Sprintf(
				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
				sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
			))
		}
	}
	return dedupeStrings(findings)
}
// benchmarkOverallStatus reduces the per-GPU statuses to one overall verdict:
// "FAILED" when no GPU (or none at all) reports OK, "PARTIAL" when any GPU is
// PARTIAL/UNSUPPORTED or normalization was incomplete, otherwise "OK".
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
	if len(result.GPUs) == 0 {
		return "FAILED"
	}
	anyOK := false
	anyPartial := result.Normalization.Status != "full"
	for _, g := range result.GPUs {
		if g.Status == "OK" {
			anyOK = true
		} else if g.Status == "PARTIAL" || g.Status == "UNSUPPORTED" {
			anyPartial = true
		}
	}
	switch {
	case !anyOK:
		return "FAILED"
	case anyPartial:
		return "PARTIAL"
	default:
		return "OK"
	}
}
// findBenchmarkNormalization returns a pointer to the normalization entry for
// the given GPU index, or nil when that index has no entry. The pointer refers
// into the caller's slice, so mutations are visible to the caller.
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
	for i := range items {
		entry := &items[i]
		if entry.Index == idx {
			return entry
		}
	}
	return nil
}
// classifySATErrorStatus maps a failed SAT benchmark invocation to a result
// status: "UNSUPPORTED" when the classifier recognizes the output as such,
// "FAILED" for every other error.
func classifySATErrorStatus(out []byte, err error) string {
	if status, _ := classifySATResult("benchmark", out, err); status == "UNSUPPORTED" {
		return "UNSUPPORTED"
	}
	return "FAILED"
}
// parseBenchmarkFloat parses an nvidia-smi CSV field as a float64. Blank
// fields and the sentinels "N/A" / "[Not Supported]" (case-insensitive)
// yield 0, as does any unparseable value.
func parseBenchmarkFloat(raw string) float64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	parsed, _ := strconv.ParseFloat(trimmed, 64)
	return parsed
}
// parseBenchmarkUint64 parses an nvidia-smi CSV field as an unsigned integer.
// Blank fields and the sentinels "N/A" / "[Not Supported]" (case-insensitive)
// yield 0, as does any unparseable value.
func parseBenchmarkUint64(raw string) uint64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	parsed, _ := strconv.ParseUint(trimmed, 10, 64)
	return parsed
}
// benchmarkMean returns the arithmetic mean of values, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	var total float64
	for _, v := range values {
		total += v
	}
	return total / float64(n)
}
// benchmarkPercentile returns the p-th percentile (0-100) of values using
// linear interpolation between the two nearest ranks. An empty slice yields 0;
// the input is never mutated (a sorted copy is used).
func benchmarkPercentile(values []float64, p float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	if n == 1 {
		return sorted[0]
	}
	rank := p / 100.0 * float64(n-1)
	lo := int(math.Floor(rank))
	hi := int(math.Ceil(rank))
	if lo == hi {
		return sorted[lo]
	}
	// Interpolate between the neighbouring ranks.
	frac := rank - float64(lo)
	return sorted[lo] + (sorted[hi]-sorted[lo])*frac
}
// benchmarkCV returns the coefficient of variation of values as a percentage
// (population stddev divided by the mean, times 100). Empty input or a zero
// mean yields 0.
func benchmarkCV(values []float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	var total float64
	for _, v := range values {
		total += v
	}
	avg := total / float64(n)
	if avg == 0 {
		return 0
	}
	var sumSq float64
	for _, v := range values {
		d := v - avg
		sumSq += d * d
	}
	return math.Sqrt(sumSq/float64(n)) / avg * 100
}
// benchmarkClockDrift compares the average of the first quarter of samples
// against the average of the last quarter and returns the percentage DROP
// from head to tail. Rising or flat series, non-positive heads, and series
// shorter than 4 samples all yield 0.
func benchmarkClockDrift(values []float64) float64 {
	n := len(values)
	if n < 4 {
		return 0
	}
	window := n / 4
	if window < 1 {
		window = 1
	}
	avgOf := func(vs []float64) float64 {
		var s float64
		for _, v := range vs {
			s += v
		}
		return s / float64(len(vs))
	}
	head := avgOf(values[:window])
	tail := avgOf(values[n-window:])
	// Only report downward drift from a positive starting level.
	if head <= 0 || tail >= head {
		return 0
	}
	return (head - tail) / head * 100
}
// benchmarkMax returns the largest element of values, or 0 for an empty slice.
// Note that all-negative input returns the (negative) maximum, not 0.
func benchmarkMax(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	best := values[0]
	for _, v := range values[1:] {
		if v > best {
			best = v
		}
	}
	return best
}
// clampScore limits a score to the inclusive range [0, 100].
func clampScore(value float64) float64 {
	if value < 0 {
		return 0
	}
	if value > 100 {
		return 100
	}
	return value
}
// dedupeStrings trims every entry, drops blanks, and removes duplicates while
// preserving first-seen order. A nil/empty input returns nil.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		if _, dup := seen[trimmed]; dup {
			continue
		}
		seen[trimmed] = struct{}{}
		result = append(result, trimmed)
	}
	return result
}
// saturatingSub returns after-before, clamped at zero instead of wrapping
// around when the counter did not increase.
func saturatingSub(after, before uint64) uint64 {
	if after > before {
		return after - before
	}
	return 0
}
// maxInt returns the larger of two ints.
func maxInt(a, b int) int {
	if a < b {
		return b
	}
	return a
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
// The ipmitool invocation is bounded by a 10-second timeout.
func queryIPMIServerPowerW() (float64, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	out, err := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading").Output()
	if err != nil {
		return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
	}
	watts := parseDCMIPowerReading(string(out))
	if watts <= 0 {
		return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
	}
	return watts, nil
}
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
// durationSec seconds. Returns the mean of all successful samples.
// Returns 0, false if IPMI is unavailable.
//
// Failed readings are silently skipped: ok is false only when no sample at
// all succeeded within the window. Cancelling ctx stops sampling early but
// still returns the mean of whatever was collected so far.
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
	if durationSec <= 0 {
		return 0, false
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var samples []float64
loop:
	for {
		// Sample first so even a very short window yields at least one attempt.
		if w, err := queryIPMIServerPowerW(); err == nil {
			samples = append(samples, w)
		}
		if time.Now().After(deadline) {
			break
		}
		// Wait out the 2 s cadence, or bail early on context cancellation.
		select {
		case <-ctx.Done():
			break loop
		case <-time.After(2 * time.Second):
		}
	}
	if len(samples) == 0 {
		return 0, false
	}
	var sum float64
	for _, w := range samples {
		sum += w
	}
	return sum / float64(len(samples)), true
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state. When
// IPMI is unavailable only a note is recorded; the reporting ratio
// (server delta / GPU sum) is computed only when both quantities are positive.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
	sp := &BenchmarkServerPower{Available: ipmiAvailable}
	if !ipmiAvailable {
		sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
		return sp
	}
	delta := loadedW - idleW
	sp.IdleW = idleW
	sp.LoadedW = loadedW
	sp.DeltaW = delta
	sp.GPUReportedSumW = gpuReportedSumW
	if gpuReportedSumW > 0 && delta > 0 {
		sp.ReportingRatio = delta / gpuReportedSumW
	}
	return sp
}
// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT").
// Returns empty string if unavailable (non-Linux or missing DMI entry).
func readServerModel() string {
	raw, err := os.ReadFile("/sys/class/dmi/id/product_name")
	if err != nil {
		// No DMI entry (or not Linux) — report unknown rather than failing.
		return ""
	}
	return strings.TrimSpace(string(raw))
}
// filterRowsByGPU returns only the metric rows for a specific GPU index.
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
	var matched []GPUMetricRow
	for i := range rows {
		if rows[i].GPUIndex == gpuIndex {
			matched = append(matched, rows[i])
		}
	}
	return matched
}
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
// and returns a per-GPU parse result map. Lines without a well-formed
// "[gpu N] " prefix are ignored.
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
	const prefix = "[gpu "
	perGPU := make(map[int][]string)
	normalized := strings.ReplaceAll(raw, "\r\n", "\n")
	for _, line := range strings.Split(normalized, "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		sep := strings.Index(line, "] ")
		if sep < 0 {
			continue
		}
		idx, convErr := strconv.Atoi(strings.TrimSpace(line[len(prefix):sep]))
		if convErr != nil {
			continue
		}
		perGPU[idx] = append(perGPU[idx], line[sep+2:])
	}
	parsed := make(map[int]benchmarkBurnParseResult, len(perGPU))
	for idx, lines := range perGPU {
		// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
		// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
		parsed[idx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
	}
	return parsed
}
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
// simultaneously using a single bee-gpu-burn invocation per phase. It populates
// result.GPUs with one finalized entry per selected GPU and accumulates metric
// rows, server power samples, and per-stage logs via the supplied pointers.
func runNvidiaBenchmarkParallel(
	ctx context.Context,
	verboseLog, runDir string,
	selected []int,
	infoByIndex map[int]benchmarkGPUInfo,
	opts NvidiaBenchmarkOptions,
	spec benchmarkProfileSpec,
	logFunc func(string),
	result *NvidiaBenchmarkResult,
	calibByIndex map[int]benchmarkPowerCalibrationResult,
	serverIdleW *float64, serverLoadedWSum *float64,
	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
	allMetricRows *[]GPUMetricRow,
	metricTimelineSec *float64,
	gpuBurnLog string,
) {
	allDevices := joinIndexList(selected)
	// Build per-GPU result stubs.
	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
	for _, idx := range selected {
		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
		if info, ok := infoByIndex[idx]; ok {
			r.UUID = info.UUID
			r.Name = info.Name
			r.BusID = info.BusID
			r.VBIOS = info.VBIOS
			r.PowerLimitW = info.PowerLimitW
			r.MultiprocessorCount = info.MultiprocessorCount
			r.DefaultPowerLimitW = info.DefaultPowerLimitW
			r.ShutdownTempC = info.ShutdownTempC
			r.SlowdownTempC = info.SlowdownTempC
			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		if calib, ok := calibByIndex[idx]; ok {
			r.CalibratedPeakPowerW = calib.Summary.P95PowerW
			r.CalibratedPeakTempC = calib.Summary.P95TempC
			r.PowerCalibrationTries = calib.Attempts
			r.PowerLimitDerated = calib.Derated
			r.Notes = append(r.Notes, calib.Notes...)
			if calib.CoolingWarning != "" {
				r.CoolingWarning = calib.CoolingWarning
			}
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}
		gpuResults[idx] = r
	}
	// Baseline: sample all GPUs together.
	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
	if err != nil && err != context.Canceled {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
		}
	}
	for _, idx := range selected {
		perGPU := filterRowsByGPU(baselineRows, idx)
		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
	}
	appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))
	// Sample server idle power once.
	if !*serverIdleOK {
		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
			*serverIdleW = w
			*serverIdleOK = true
			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
		}
	}
	// Warmup: all GPUs simultaneously.
	warmupCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(spec.WarmupSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
	}
	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
	appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
	appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
	if warmupErr != nil {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
		}
	}
	warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
	supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
	for _, idx := range selected {
		if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
			if gpuResults[idx].ComputeCapability == "" {
				gpuResults[idx].ComputeCapability = pr.ComputeCapability
			}
			if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
				supportedPrecisions = ccPrecisions
			}
		}
	}
	// Run synthetic precision phases and the combined steady phase as one
	// uninterrupted command so the GPUs stay hot between windows.
	eccBase := make(map[int]BenchmarkECCCounters, len(selected))
	for _, idx := range selected {
		eccBase[idx], _ = queryECCCounters(idx)
	}
	// Snapshot throttle counters BEFORE the combined plan runs so the
	// before/after diff actually covers the load window. (Snapshotting after
	// the plan — as a previous revision did — makes the diff always ~zero.)
	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		beforeThrottle[idx], _ = queryThrottleCounters(idx)
	}
	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
		if label == "mixed" {
			return "steady"
		}
		return "gpu-all-steady-" + label
	})
	planCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(basePhaseSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
		"--precision-plan", strings.Join(planLabels, ","),
		"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
	}
	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
	// Sample server power via IPMI concurrently with the plan execution. The
	// sampler must be running WHILE the GPUs are under load, so start it before
	// the planned command and stop it immediately after the command returns.
	ipmiStopCh := make(chan struct{})
	ipmiResultCh := make(chan float64, 1)
	go func() {
		defer close(ipmiResultCh)
		var samples []float64
		ticker := time.NewTicker(5 * time.Second)
		defer ticker.Stop()
		// Skip the first 15 s so power ramp-up does not skew the loaded average.
		select {
		case <-ipmiStopCh:
			return
		case <-time.After(15 * time.Second):
		}
		for {
			if w, err := queryIPMIServerPowerW(); err == nil {
				samples = append(samples, w)
			}
			select {
			case <-ipmiStopCh:
				if len(samples) > 0 {
					var sum float64
					for _, w := range samples {
						sum += w
					}
					ipmiResultCh <- sum / float64(len(samples))
				}
				return
			case <-ticker.C:
			}
		}
	}()
	_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
	// Stop the IPMI sampler now that the load window is over; a closed result
	// channel without a value means no sample succeeded (ok == false).
	close(ipmiStopCh)
	if loadedW, ok := <-ipmiResultCh; ok {
		*serverLoadedWSum += loadedW
		(*serverLoadedSamples)++
		*serverLoadedOK = true
		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
	}
	// Snapshot throttle counters immediately after the load window.
	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		afterThrottle[idx], _ = queryThrottleCounters(idx)
	}
	for _, phaseSpec := range planPhases {
		if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
		}
		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
	}
	for _, prec := range supportedPrecisions {
		phaseLogName := "gpu-all-steady-" + prec
		phaseRows := phaseRowsByStage[phaseLogName]
		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
		for _, idx := range selected {
			perGPU := filterRowsByGPU(phaseRows, idx)
			phase := BenchmarkPrecisionSteadyPhase{
				Precision: prec,
				Status:    "OK",
				Steady:    summarizeBenchmarkTelemetry(perGPU),
			}
			if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
				phase.Status = status
				phase.Notes = note
				gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
			}
			if pr, ok := parseByGPU[idx]; ok {
				for _, p := range pr.Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
			}
			gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
		}
	}
	steadyRows := phaseRowsByStage["steady"]
	parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
	for _, idx := range selected {
		perGPU := filterRowsByGPU(steadyRows, idx)
		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
		if eccFinal, err := queryECCCounters(idx); err == nil {
			gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
		}
		if pr, ok := parseResults[idx]; ok {
			gpuResults[idx].ComputeCapability = pr.ComputeCapability
			gpuResults[idx].Backend = pr.Backend
			gpuResults[idx].PrecisionResults = pr.Profiles
			if pr.Fallback {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
		}
		if planErr != nil {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
		}
	}
	// Cooldown: all GPUs together.
	if spec.CooldownSec > 0 {
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
		if err != nil && err != context.Canceled {
			for _, idx := range selected {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
			}
		}
		for _, idx := range selected {
			perGPU := filterRowsByGPU(cooldownRows, idx)
			gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
		}
		appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
	}
	// Score and finalize each GPU.
	for _, idx := range selected {
		r := gpuResults[idx]
		applyBenchmarkSteadyFallback(r)
		r.Scores = scoreBenchmarkGPUResult(*r)
		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
		pr := parseResults[idx]
		switch {
		case planErr != nil:
			r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
		case len(r.PrecisionFailures) > 0:
			r.Status = "PARTIAL"
		case pr.Fallback:
			r.Status = "PARTIAL"
		default:
			r.Status = "OK"
		}
		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
	}
}
// readBenchmarkHostConfig reads static CPU and memory configuration from
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
func readBenchmarkHostConfig() *BenchmarkHostConfig {
	cfg := &BenchmarkHostConfig{}
	populated := false
	// Parse /proc/cpuinfo for CPU model, sockets, cores, threads. /proc/cpuinfo
	// lists one "processor" stanza per logical CPU; within a stanza the
	// "physical id" line precedes "cpu cores", so a single pass can attribute
	// each "cpu cores" value to the socket most recently seen.
	if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
		socketIDs := map[string]struct{}{}
		coresPerSocket := map[string]int{}
		var modelName string
		var curSocket string
		threads := 0
		for _, line := range strings.Split(string(data), "\n") {
			kv := strings.SplitN(line, ":", 2)
			if len(kv) != 2 {
				continue
			}
			key := strings.TrimSpace(kv[0])
			val := strings.TrimSpace(kv[1])
			switch key {
			case "processor":
				threads++
			case "model name":
				if modelName == "" {
					modelName = val
				}
			case "physical id":
				socketIDs[val] = struct{}{}
				curSocket = val
			case "cpu cores":
				// Every stanza of the same socket reports the same core count,
				// so record it only the first time the socket is seen.
				if curSocket != "" {
					if _, seen := coresPerSocket[curSocket]; !seen {
						v, _ := strconv.Atoi(val)
						coresPerSocket[curSocket] = v
					}
				}
			}
		}
		totalCores := 0
		for _, c := range coresPerSocket {
			totalCores += c
		}
		cfg.CPUModel = modelName
		cfg.CPUSockets = len(socketIDs)
		// Some platforms (e.g. certain ARM SoCs) omit "physical id"; assume a
		// single socket when any processors were seen at all.
		if cfg.CPUSockets == 0 && threads > 0 {
			cfg.CPUSockets = 1
		}
		cfg.CPUCores = totalCores
		cfg.CPUThreads = threads
		if modelName != "" || threads > 0 {
			populated = true
		}
	}
	// Parse /proc/meminfo for total physical RAM (reported in KiB).
	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
		for _, line := range strings.Split(string(data), "\n") {
			if strings.HasPrefix(line, "MemTotal:") {
				fields := strings.Fields(line)
				if len(fields) >= 2 {
					kb, _ := strconv.ParseUint(fields[1], 10, 64)
					cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
					populated = true
				}
				break
			}
		}
	}
	if !populated {
		return nil
	}
	return cfg
}
// startCPULoadSampler starts a goroutine that samples host CPU load every
// intervalSec seconds until stopCh is closed, then sends the collected
// samples on the returned channel. A non-positive intervalSec is clamped to
// 1 second — time.NewTicker panics on non-positive durations.
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
	if intervalSec < 1 {
		intervalSec = 1
	}
	// Buffered so the final send never blocks even if the consumer is slow.
	ch := make(chan []float64, 1)
	go func() {
		var samples []float64
		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				// Zero readings are treated as sampling failures and skipped.
				if pct := sampleCPULoadPct(); pct > 0 {
					samples = append(samples, pct)
				}
			}
		}
	}()
	return ch
}
// summarizeCPULoad computes stats over sampled CPU load values and assigns
// a health status: "high" when the average or peak load is elevated,
// "unstable" when the standard deviation is large, otherwise "ok".
// Returns nil when no samples were collected.
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
	n := len(samples)
	if n == 0 {
		return nil
	}
	sorted := append([]float64(nil), samples...)
	sort.Float64s(sorted)
	var total float64
	for _, v := range sorted {
		total += v
	}
	avg := total / float64(n)
	p95 := sorted[int(float64(n)*0.95)]
	peak := sorted[n-1]
	// Population standard deviation — used to detect erratic load.
	var sumSq float64
	for _, v := range sorted {
		d := v - avg
		sumSq += d * d
	}
	stdDev := math.Sqrt(sumSq / float64(n))
	cl := &BenchmarkCPULoad{
		AvgPct:  math.Round(avg*10) / 10,
		MaxPct:  math.Round(peak*10) / 10,
		P95Pct:  math.Round(p95*10) / 10,
		Samples: n,
	}
	switch {
	case avg > 20 || peak > 40:
		cl.Status = "high"
		cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, peak)
	case stdDev > 12:
		cl.Status = "unstable"
		cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
	default:
		cl.Status = "ok"
	}
	return cl
}
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
// thermal state instead of assuming its previous single-step limit is still
// valid. The selected reduced power limits stay active for the main benchmark
// and are restored by the caller afterwards.
func runBenchmarkPowerCalibration(
ctx context.Context,
verboseLog, runDir string,
gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string),
seedLimits map[int]int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the
// next busy response fails the calibration immediately.
const dcgmResourceBusyMaxDelaySec = 300
if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]benchmarkPowerCalibrationResult{}, nil
}
if killed := KillTestWorkers(); len(killed) > 0 {
for _, p := range killed {
logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
canDerate := os.Geteuid() == 0
if !canDerate {
logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
}
type calibrationAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}
// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
info benchmarkGPUInfo
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
hi int // lowest verified-unstable limit (exclusive sentinel above start)
calib benchmarkPowerCalibrationResult
converged bool
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction
// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices {
info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW))
if originalLimitW <= 0 {
originalLimitW = int(math.Round(info.DefaultPowerLimitW))
}
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
if defaultLimitW <= 0 {
defaultLimitW = originalLimitW
}
appliedLimitW := originalLimitW
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
}
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
}
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
}
s := &gpuCalibState{
idx: idx,
info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
}
s.appliedLimitW = seedW
s.hi = seedW + 1
s.calib.AppliedPowerLimitW = float64(seedW)
s.calib.Derated = seedW < s.originalLimitW
s.calib.Notes = append(s.calib.Notes,
fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW))
}
}
states = append(states, s)
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
restore = append(restore, benchmarkRestoreAction{
name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
fn: func() {
_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
},
})
}
}
// Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0
type sharedAttemptResult struct {
out []byte
rows []GPUMetricRow
err error
}
calibDone:
for {
// Collect non-converged GPUs.
var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}
sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}
// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
throttleReasons := make(map[int]string, len(active))
var ar sharedAttemptResult
attemptLoop:
for {
select {
case ar = <-doneCh:
break attemptLoop
case <-ticker.C:
// Poll throttle counters for each active GPU independently.
for _, s := range active {
if throttleReasons[s.idx] != "" {
continue // already detected for this GPU
}
after, err := queryThrottleCounters(s.idx)
if err != nil {
continue
}
// Record throttle but do NOT cancel — let dcgmi finish so
// nv-hostengine releases the slot cleanly before the next attempt.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
throttleReasons[s.idx] = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
}
}
case <-ctx.Done():
cancelAttempt()
ar = <-doneCh
break attemptLoop
}
}
ticker.Stop()
cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
// Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
}
busyRetries++
// Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select {
case <-ctx.Done():
break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1
}
busyDelaySec = next
sharedAttempt-- // retry same logical attempt number
continue
}
busyRetries = 0
busyDelaySec = 1
// Per-GPU analysis and binary search update.
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU)
throttle := throttleReasons[s.idx]
// Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttle, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
}
}
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
// Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
}
}
}
s.converged = true
continue
}
// Failed or throttled — log and binary-search downward.
switch {
case throttle != "":
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
default:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
}
if !canDerate || s.appliedLimitW <= 0 {
s.converged = true
continue
}
s.hi = s.appliedLimitW
if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo
s.calib.AppliedPowerLimitW = float64(s.lo)
s.calib.Derated = s.lo < s.originalLimitW
// Summary was captured when we last verified stability at s.lo,
// so the result is valid — mark as completed even though we
// converged from the failure path (tried higher, failed, fell back).
s.calib.Completed = true
}
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
}
s.converged = true
continue
}
next := roundTo5W((s.lo + s.hi) / 2)
if next <= s.lo {
next = s.lo + calibSearchTolerance
}
if next >= s.hi {
next = (s.lo + s.hi) / 2
}
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.converged = true
continue
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
s.converged = true
continue
}
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Derated = next < s.originalLimitW
s.info.PowerLimitW = float64(next)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
}
}
for _, s := range states {
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
}
}
return results, restore
}
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
var exitErr *exec.ExitError
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}
// roundTo5W rounds w to the nearest 5 W boundary (halves biased upward).
func roundTo5W(w int) int {
	// Identical to ((w+2)/5)*5: x - x%5 equals (x/5)*5 for every int in Go.
	shifted := w + 2
	return shifted - shifted%5
}
// powerBenchDurationSec maps a benchmark profile name to the targeted_power
// run duration in seconds. Unknown or empty profiles fall back to the quick
// two-minute run.
func powerBenchDurationSec(profile string) int {
	normalized := strings.TrimSpace(strings.ToLower(profile))
	if normalized == NvidiaBenchmarkProfileStability {
		return 300
	}
	if normalized == NvidiaBenchmarkProfileOvernight {
		return 600
	}
	return 120
}
// cloneBenchmarkGPUInfoMap returns a shallow copy of src so per-step
// calibration can mutate GPU info without affecting the caller's map.
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
	dst := make(map[int]benchmarkGPUInfo, len(src))
	for idx, info := range src {
		dst[idx] = info
	}
	return dst
}
// renderPowerBenchReport formats a completed power benchmark run as a
// human-readable Markdown report: run metadata, the server-vs-GPU power
// comparison (when IPMI data exists), summary findings, the recommended
// slot population order, the thermal ramp sequence, and per-slot results
// with free-form notes.
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Report\n\n")
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
	fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
	// Header-level IPMI figures only when server power data was captured.
	if sp := result.ServerPower; sp != nil && sp.Available {
		fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
		fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
	}
	b.WriteString("\n")
	// Server power comparison table.
	if sp := result.ServerPower; sp != nil {
		b.WriteString("## Server vs GPU Power Comparison\n\n")
		b.WriteString("| Metric | Value |\n")
		b.WriteString("|--------|-------|\n")
		fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
		if sp.Available {
			fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
			fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
			// NOTE(review): label below reads "loaded idle" — possibly missing a
			// "−" between the words in the original; confirm against intent.
			fmt.Fprintf(&b, "| Server Δ power (loaded idle) | %.0f W |\n", sp.DeltaW)
			// Ratio thresholds: ≥0.9 healthy, 0.75–0.9 minor discrepancy,
			// <0.75 significant GPU over-reporting vs wall power.
			ratio := sp.ReportingRatio
			ratioNote := ""
			switch {
			case ratio >= 0.9:
				ratioNote = "✓ GPU telemetry matches server power"
			case ratio >= 0.75:
				ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
			default:
				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
			}
			fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
		} else {
			b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
		}
		// Notes are emitted as blockquotes even when IPMI was unavailable.
		for _, note := range sp.Notes {
			fmt.Fprintf(&b, "\n> %s\n", note)
		}
		b.WriteString("\n")
	}
	if len(result.Findings) > 0 {
		b.WriteString("## Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString("## Recommended Slot Order\n\n")
		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
	}
	// Ramp sequence table: one row per Phase 2 cumulative-load step.
	if len(result.RampSteps) > 0 {
		b.WriteString("## Ramp Sequence\n\n")
		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
		for _, step := range result.RampSteps {
			derated := "-"
			if step.Derated {
				derated = "⚠ yes"
			}
			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
		}
		b.WriteString("\n")
	}
	// Per-slot summary table, then a detail section per GPU with its notes.
	b.WriteString("## Per-Slot Results\n\n")
	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
	for _, gpu := range result.GPUs {
		stableLimit := "-"
		if gpu.StablePowerLimitW > 0 {
			// A warning marker flags limits that had to be derated below default.
			if gpu.Derated {
				stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
			} else {
				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
			}
		}
		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
	}
	b.WriteString("\n")
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
		for _, note := range gpu.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}
	return b.String()
}
// renderPowerBenchSummary renders the power benchmark result as flat
// key=value lines for machine consumption (one value per line).
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
	var sb strings.Builder
	emit := func(format string, args ...interface{}) {
		fmt.Fprintf(&sb, format+"\n", args...)
	}
	emit("run_at_utc=%s", result.GeneratedAt.Format(time.RFC3339))
	emit("benchmark_version=%s", result.BenchmarkVersion)
	emit("benchmark_profile=%s", result.BenchmarkProfile)
	emit("overall_status=%s", result.OverallStatus)
	emit("platform_max_tdp_w=%.0f", result.PlatformMaxTDPW)
	emit("gpu_count=%d", len(result.GPUs))
	if len(result.RecommendedSlotOrder) > 0 {
		emit("recommended_slot_order=%s", joinIndexList(result.RecommendedSlotOrder))
	}
	// One group of four keys per ramp step, keyed by the step index.
	for _, step := range result.RampSteps {
		emit("ramp_step_%d_gpus=%s", step.StepIndex, joinIndexList(step.GPUIndices))
		emit("ramp_step_%d_new_gpu=%d", step.StepIndex, step.NewGPUIndex)
		emit("ramp_step_%d_stable_limit_w=%.0f", step.StepIndex, step.NewGPUStableLimitW)
		emit("ramp_step_%d_total_power_w=%.0f", step.StepIndex, step.TotalObservedPowerW)
	}
	// Stable limits are only emitted for GPUs that actually got one.
	for _, gpu := range result.GPUs {
		if gpu.StablePowerLimitW > 0 {
			emit("gpu_%d_stable_limit_w=%.0f", gpu.Index, gpu.StablePowerLimitW)
		}
	}
	// IPMI keys appear only when server power sampling succeeded.
	if sp := result.ServerPower; sp != nil && sp.Available {
		emit("server_idle_w=%.0f", sp.IdleW)
		emit("server_loaded_w=%.0f", sp.LoadedW)
		emit("server_delta_w=%.0f", sp.DeltaW)
		emit("server_reporting_ratio=%.2f", sp.ReportingRatio)
	}
	return sb.String()
}
// RunNvidiaPowerBench measures the real sustainable power budget of the
// selected NVIDIA GPUs in two phases:
//
//   - Phase 1: each GPU is calibrated alone (targeted_power binary search)
//     to establish a single-card baseline unaffected by neighbour heat.
//   - Phase 2: GPUs are ramped in one at a time in the recommended order;
//     every already-active GPU is revalidated under the cumulative thermal
//     load and re-derated if the hotter environment demands it.
//
// IPMI server power is sampled idle (before Phase 1) and loaded (throughout
// Phase 2) to cross-check GPU-reported TDP against wall power. Artifacts
// (result.json, report.md, summary.txt, per-attempt logs) are written to a
// timestamped run directory under baseDir; that directory path is returned.
// GPU power limits modified during calibration are restored on return.
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/power"
	}
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "power-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		return "", infoErr
	}
	hostname, _ := os.Hostname()
	result := NvidiaPowerBenchResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   opts.Profile,
		SelectedGPUIndices: append([]int(nil), selected...),
		OverallStatus:      "OK",
	}
	// Sample IPMI idle power before any GPU load.
	var serverIdleW float64
	var serverIdleOK bool
	if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
		serverIdleW = w
		serverIdleOK = true
		logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
	}
	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
	// establish a true single-card power baseline unaffected by neighbour heat.
	calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
	var allRestoreActions []benchmarkRestoreAction
	for _, idx := range selected {
		singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
		_ = os.MkdirAll(singleDir, 0755)
		singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
		logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
		c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
		allRestoreActions = append(allRestoreActions, restore...)
		if r, ok := c[idx]; ok {
			calibByIndex[idx] = r
		}
	}
	// Restore every modified power limit in reverse order on exit. The closure
	// reads allRestoreActions at defer time, so Phase 2 additions are included.
	defer func() {
		for i := len(allRestoreActions) - 1; i >= 0; i-- {
			allRestoreActions[i].fn()
		}
	}()
	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
	for _, idx := range selected {
		info := infoByIndex[idx]
		calib := calibByIndex[idx]
		status := "OK"
		if !calib.Completed {
			status = "FAILED"
			result.OverallStatus = "PARTIAL"
		} else if calib.Derated {
			status = "PARTIAL"
			if result.OverallStatus == "OK" {
				result.OverallStatus = "PARTIAL"
			}
		}
		gpus = append(gpus, NvidiaPowerBenchGPU{
			Index:               idx,
			Name:                info.Name,
			BusID:               info.BusID,
			DefaultPowerLimitW:  info.DefaultPowerLimitW,
			AppliedPowerLimitW:  calib.AppliedPowerLimitW,
			MaxObservedPowerW:   calib.Summary.P95PowerW,
			MaxObservedTempC:    calib.Summary.P95TempC,
			CalibrationAttempts: calib.Attempts,
			Derated:             calib.Derated,
			Status:              status,
			Notes:               append([]string(nil), calib.Notes...),
			CoolingWarning:      calib.CoolingWarning,
		})
	}
	// Rank GPUs best-first: highest realized power, then highest applied limit,
	// then non-derated before derated, with index as the deterministic tiebreak.
	sort.Slice(gpus, func(i, j int) bool {
		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
			return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
		}
		if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
			return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
		}
		if gpus[i].Derated != gpus[j].Derated {
			return !gpus[i].Derated
		}
		return gpus[i].Index < gpus[j].Index
	})
	result.GPUs = gpus
	result.RecommendedSlotOrder = make([]int, 0, len(gpus))
	for _, gpu := range gpus {
		result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
	}
	if len(result.RecommendedSlotOrder) > 0 {
		result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
	}
	for _, gpu := range gpus {
		if gpu.Derated {
			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
		}
		if gpu.CoolingWarning != "" {
			result.Findings = append(result.Findings, fmt.Sprintf(
				"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
				gpu.Index, gpu.CoolingWarning,
			))
		}
	}
	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
	for _, gpu := range gpus {
		singleByIndex[gpu.Index] = gpu
	}
	// Phase 2: cumulative thermal ramp.
	// Each step introduces one new GPU into an environment where all previously
	// calibrated GPUs are already running at their fixed stable limits. The new
	// GPU's stable TDP is searched via binary search (targeted_power) under real
	// multi-GPU thermal load. Once found, its limit is fixed permanently for all
	// subsequent steps. This ensures each GPU's limit reflects actual sustained
	// power in the final full-system thermal state.
	//
	// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
	stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
	// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
	// server-side loaded power while GPUs are under stress. The goroutine is
	// cancelled as soon as Phase 2 finishes, and the average is used to compare
	// against PlatformMaxTDPW (GPU-reported stable limits sum).
	var serverLoadedW float64
	var serverLoadedOK bool
	ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
	ipmiPhase2Done := make(chan float64, 1)
	go func() {
		defer close(ipmiPhase2Done)
		if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
			ipmiPhase2Done <- w
		}
	}()
	// Step 1: reuse single-card calibration result directly.
	if len(result.RecommendedSlotOrder) > 0 {
		firstIdx := result.RecommendedSlotOrder[0]
		firstCalib := calibByIndex[firstIdx]
		stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
		ramp := NvidiaPowerBenchStep{
			StepIndex:           1,
			GPUIndices:          []int{firstIdx},
			NewGPUIndex:         firstIdx,
			NewGPUStableLimitW:  firstCalib.AppliedPowerLimitW,
			TotalObservedPowerW: firstCalib.Summary.P95PowerW,
			AvgObservedPowerW:   firstCalib.Summary.P95PowerW,
			Derated:             firstCalib.Derated,
			Status:              "OK",
		}
		if !firstCalib.Completed {
			ramp.Status = "FAILED"
			ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
			result.OverallStatus = "PARTIAL"
		} else if firstCalib.Derated {
			ramp.Status = "PARTIAL"
			if result.OverallStatus == "OK" {
				result.OverallStatus = "PARTIAL"
			}
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW))
		}
		result.RampSteps = append(result.RampSteps, ramp)
		logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W",
			len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
	}
	// Steps 2..N: each step revalidates every already-active GPU under the new
	// cumulative thermal environment and also calibrates the newly introduced
	// GPU. Previously found limits are used only as seeds for the search.
	for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
		step := stepNum + 1
		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
		newGPUIdx := result.RecommendedSlotOrder[stepNum]
		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
		_ = os.MkdirAll(stepDir, 0755)
		// Reuse the latest stable limits as starting points, but re-check every
		// active GPU in this hotter configuration. For the newly introduced GPU,
		// seed from its single-card calibration so we do not restart from the
		// default TDP when a prior derated limit is already known.
		seedForStep := make(map[int]int, len(subset))
		for _, idx := range subset {
			if lim, ok := stableLimits[idx]; ok && lim > 0 {
				seedForStep[idx] = lim
				continue
			}
			if base, ok := calibByIndex[idx]; ok {
				lim := int(math.Round(base.AppliedPowerLimitW))
				if lim > 0 {
					seedForStep[idx] = lim
				}
			}
		}
		logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
			step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
		// Accumulate restore actions; they all run in the outer defer.
		allRestoreActions = append(allRestoreActions, stepRestore...)
		ramp := NvidiaPowerBenchStep{
			StepIndex:   step,
			GPUIndices:  subset,
			NewGPUIndex: newGPUIdx,
			Status:      "OK",
		}
		// Total observed power = sum of p95 across all GPUs in this step.
		for _, idx := range subset {
			if c, ok := stepCalib[idx]; ok {
				ramp.TotalObservedPowerW += c.Summary.P95PowerW
			}
		}
		if len(subset) > 0 {
			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
		}
		for _, idx := range subset {
			c, ok := stepCalib[idx]
			if !ok || !c.Completed {
				// Calibration failed in this step: keep the best previously
				// known limit (earlier step, else single-card baseline).
				fallback := 0
				if lim, ok := stableLimits[idx]; ok && lim > 0 {
					fallback = lim
				} else if fb, ok := calibByIndex[idx]; ok {
					fallback = int(math.Round(fb.AppliedPowerLimitW))
				}
				if fallback > 0 {
					stableLimits[idx] = fallback
				}
				ramp.Status = "FAILED"
				ramp.Notes = append(ramp.Notes,
					fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
				result.OverallStatus = "PARTIAL"
				continue
			}
			prevLimit, hadPrev := stableLimits[idx]
			newLimit := int(math.Round(c.AppliedPowerLimitW))
			stableLimits[idx] = newLimit
			if idx == newGPUIdx {
				ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
				ramp.Derated = c.Derated
			}
			if c.Derated {
				ramp.Status = "PARTIAL"
				if result.OverallStatus == "OK" {
					result.OverallStatus = "PARTIAL"
				}
			}
			if hadPrev && newLimit < prevLimit {
				ramp.Notes = append(ramp.Notes,
					fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit))
			}
		}
		if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated {
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
		}
		result.RampSteps = append(result.RampSteps, ramp)
	}
	// Stop IPMI Phase 2 sampling and collect result. The goroutine closes the
	// channel on exit, so ok is false when no sample was produced.
	ipmiPhase2Cancel()
	if w, ok := <-ipmiPhase2Done; ok {
		serverLoadedW = w
		serverLoadedOK = true
		logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
	}
	// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
	for i := range result.GPUs {
		if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
			result.GPUs[i].StablePowerLimitW = float64(lim)
		}
		if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 &&
			result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW {
			result.GPUs[i].Derated = true
			result.Findings = append(result.Findings, fmt.Sprintf(
				"GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.",
				result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW,
			))
		}
	}
	// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
	// budget of this server with all GPUs running simultaneously without throttling.
	for _, lim := range stableLimits {
		result.PlatformMaxTDPW += float64(lim)
	}
	// Characterize server power from IPMI idle/loaded samples.
	// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
	// ReportingRatio = IPMI_delta / GPU_reported_sum:
	// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
	resultJSON, err := json.MarshalIndent(result, "", "  ")
	if err != nil {
		return "", fmt.Errorf("marshal power result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}