Phase 1 now calibrates each GPU individually (sequentially) so that PowerRealizationPct reflects real degradation from neighbour thermals and shared power rails. Previously the baseline came from an all-GPU-together run, making realization always ≈100% at the final ramp step. Ramp step 1 reuses single-card calibration results (no extra run); steps 2..N run targeted_power on the growing GPU subset with derating active. Remove OccupiedSlots/OccupiedSlotsNote fields and occupiedSlots() helper — they were compensation for the old all-GPU calibration approach. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
3113 lines
108 KiB
Go
3113 lines
108 KiB
Go
package platform
|
||
|
||
import (
|
||
"context"
|
||
"encoding/csv"
|
||
"encoding/json"
|
||
"errors"
|
||
"fmt"
|
||
"math"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"regexp"
|
||
"sort"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
)
|
||
|
||
// benchmarkVersion tags result.json so downstream consumers can detect
// format changes between benchmark runs.
const benchmarkVersion = "2"
|
||
|
||
// benchmarkProfileSpec defines the per-phase durations (in seconds) for one
// named benchmark profile (see resolveBenchmarkProfile for the presets).
type benchmarkProfileSpec struct {
	Name        string // profile identifier (one of the NvidiaBenchmarkProfile* constants)
	BaselineSec int    // idle telemetry sampling before any load
	WarmupSec   int    // warmup burn duration before measured phases
	SteadySec   int    // total steady-state load budget (split across precision windows)
	NCCLSec     int    // interconnect (NCCL) test duration
	CooldownSec int    // post-load telemetry sampling; 0 disables the cooldown phase
}
|
||
|
||
// benchmarkGPUInfo is the static inventory record for one GPU, populated from
// nvidia-smi --query-gpu CSV output (queryBenchmarkGPUInfo) and backfilled
// from nvidia-smi -q verbose output when CSV fields are unsupported
// (enrichGPUInfoWithMaxClocks).
type benchmarkGPUInfo struct {
	Index                int     // nvidia-smi device index
	UUID                 string  // GPU UUID
	Name                 string  // product name
	BusID                string  // PCI bus id; used to match verbose -q sections
	VBIOS                string  // VBIOS version string
	PowerLimitW          float64 // current power limit (W)
	DefaultPowerLimitW   float64 // driver default power limit (W); 0 when unavailable
	MaxGraphicsClockMHz  float64 // 0 when the CSV field is unsupported; filled from -q output
	MaxMemoryClockMHz    float64 // 0 when the CSV field is unsupported; filled from -q output
	BaseGraphicsClockMHz float64 // base (non-boost) graphics clock; 0 when unavailable
	MultiprocessorCount  int     // SM count; 0 when unavailable
}
|
||
|
||
// benchmarkPowerCalibrationResult captures the outcome of the targeted_power
// calibration pass for a single GPU (see runBenchmarkPowerCalibration).
type benchmarkPowerCalibrationResult struct {
	Summary            BenchmarkTelemetrySummary // telemetry summarized over the calibration window
	AppliedPowerLimitW float64                   // power limit in effect for the final attempt (W)
	Attempts           int                       // number of calibration attempts performed
	Derated            bool                      // true when the limit was lowered below default to complete
	Completed          bool                      // true when calibration finished successfully
	Notes              []string                  // free-form notes propagated into the GPU result
	// CoolingWarning is set when the GPU throttled thermally with a clock drop
	// ≥20% while server fans were below 100% duty cycle — a signal that the
	// cooling system may not be correctly configured for full GPU load.
	CoolingWarning string
}
|
||
|
||
// benchmarkBurnProfile is one precision profile parsed from bee-gpu-burn
// output (READY / SKIPPED / *_iterations lines; see the benchmark*Pattern
// regexps and parseBenchmarkBurnLog).
type benchmarkBurnProfile struct {
	name      string // precision label, e.g. "fp16"
	category  string // profile category reported by the tool
	supported bool   // false when the burn tool reported SKIPPED
	// lanes — presumably the lane count from the "[N]" part of READY lines;
	// TODO confirm against bee-gpu-burn output.
	lanes      int
	m          uint64 // GEMM M dimension from "dim=MxNxK" in the READY line
	n          uint64 // GEMM N dimension
	k          uint64 // GEMM K dimension
	iterations uint64 // iterations reported by the "<name>_iterations=" line
	notes      string // skip reason or other tool-provided detail
}
|
||
|
||
// benchmarkBurnParseResult is the structured result of parsing one
// bee-gpu-burn log (parseBenchmarkBurnLog).
type benchmarkBurnParseResult struct {
	Device            string                     // device name reported by the tool
	ComputeCapability string                     // CUDA compute capability, e.g. "9.0"
	Backend           string                     // compute backend reported by the tool
	DurationSec       int                        // run duration reported by the tool
	Profiles          []BenchmarkPrecisionResult // per-precision throughput results
	Fallback          bool                       // true when the tool ran via the driver PTX fallback path
}
|
||
|
||
// benchmarkRestoreAction is a named cleanup step (e.g. unlock clocks, restore
// power limits) registered during setup and executed in reverse registration
// order when the benchmark returns.
type benchmarkRestoreAction struct {
	name string // label identifying the restore step (used in log filenames)
	fn   func() // restore callback; invoked unconditionally on teardown
}
|
||
|
||
var (
	// benchmarkReadyPattern matches "name[N]=READY dim=MxNxK" lines emitted
	// by bee-gpu-burn for each precision profile that started.
	benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	// benchmarkSkippedPattern matches "name=SKIPPED reason" lines; the "[N]"
	// index is optional here.
	benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	// benchmarkIterationsPattern matches "name_iterations=N" summary lines.
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
|
||
|
||
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
// fp4 is filtered out for pre-Blackwell parts by benchmarkSupportedPrecisions.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
|
||
|
||
// computeCapabilityCode converts a CUDA compute-capability string such as
// "9.0" into a single comparable integer (major*10 + minor), e.g. 90.
// An empty or blank input yields 0; unparseable components count as 0.
func computeCapabilityCode(raw string) int {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return 0
	}
	// Split on the first dot only; anything after a second dot makes the
	// minor component unparseable and it falls back to 0, matching Atoi.
	majorStr, minorStr, _ := strings.Cut(trimmed, ".")
	major, _ := strconv.Atoi(strings.TrimSpace(majorStr))
	minor, _ := strconv.Atoi(strings.TrimSpace(minorStr))
	return major*10 + minor
}
|
||
|
||
func benchmarkSupportedPrecisions(computeCapability string) []string {
|
||
cc := computeCapabilityCode(computeCapability)
|
||
out := make([]string, 0, len(benchmarkPrecisionPhases))
|
||
for _, prec := range benchmarkPrecisionPhases {
|
||
if prec == "fp4" && cc > 0 && cc < 100 {
|
||
continue
|
||
}
|
||
out = append(out, prec)
|
||
}
|
||
return out
|
||
}
|
||
|
||
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
|
||
if len(precisions) == 0 {
|
||
precisions = append([]string(nil), benchmarkPrecisionPhases...)
|
||
}
|
||
switch spec.Name {
|
||
case NvidiaBenchmarkProfileStandard:
|
||
basePhaseSec = 60
|
||
mixedPhaseSec = 300
|
||
case NvidiaBenchmarkProfileStability:
|
||
basePhaseSec = 300
|
||
mixedPhaseSec = 3600
|
||
case NvidiaBenchmarkProfileOvernight:
|
||
basePhaseSec = 3600
|
||
mixedPhaseSec = 14400
|
||
default:
|
||
totalWeight := len(precisions) + 5
|
||
if totalWeight <= 0 {
|
||
return nil, nil, 0, 0
|
||
}
|
||
basePhaseSec = spec.SteadySec / totalWeight
|
||
if basePhaseSec <= 0 {
|
||
basePhaseSec = 1
|
||
}
|
||
mixedPhaseSec = basePhaseSec * 5
|
||
}
|
||
planLabels = make([]string, 0, len(precisions)+1)
|
||
planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
|
||
for _, prec := range precisions {
|
||
planLabels = append(planLabels, prec)
|
||
planPhases = append(planPhases, benchmarkPlannedPhase{
|
||
PlanLabel: prec,
|
||
MetricStage: metricStage(prec),
|
||
DurationSec: basePhaseSec,
|
||
})
|
||
}
|
||
planLabels = append(planLabels, "mixed")
|
||
planPhases = append(planPhases, benchmarkPlannedPhase{
|
||
PlanLabel: "mixed",
|
||
MetricStage: metricStage("mixed"),
|
||
DurationSec: mixedPhaseSec,
|
||
})
|
||
return planLabels, planPhases, basePhaseSec, mixedPhaseSec
|
||
}
|
||
|
||
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
|
||
values := make([]string, 0, len(phases))
|
||
for _, phase := range phases {
|
||
values = append(values, strconv.Itoa(phase.DurationSec))
|
||
}
|
||
return strings.Join(values, ",")
|
||
}
|
||
|
||
// benchmarkPlannedPhaseStatus classifies a single planned phase's log output
// into a (status, note) pair: "OK" on clean output, "UNSUPPORTED" when an
// error line indicates the precision is not available on this GPU/userspace
// path, and "FAILED" otherwise (including empty output).
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	if text == "" {
		return "FAILED", "phase produced no output"
	}

	// hasUnsupportedMarker reports whether the output mentions any of the
	// generic "unsupported" phrases or the extra markers supplied.
	hasUnsupportedMarker := func(extra ...string) bool {
		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") {
			return true
		}
		for _, marker := range extra {
			if strings.Contains(text, marker) {
				return true
			}
		}
		return false
	}

	if strings.Contains(text, "phase_error=") {
		if hasUnsupportedMarker("cublaslt_profiles=unsupported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	}
	if strings.Contains(text, "status=failed") {
		if hasUnsupportedMarker() {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	}
	return "OK", ""
}
|
||
|
||
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
|
||
diff := diffThrottleCounters(before, after)
|
||
switch {
|
||
case diff.HWThermalSlowdownUS > 0:
|
||
return "hw_thermal"
|
||
case diff.SWThermalSlowdownUS > 0:
|
||
return "sw_thermal"
|
||
case diff.HWPowerBrakeSlowdownUS > 0:
|
||
return "hw_power_brake"
|
||
default:
|
||
return ""
|
||
}
|
||
}
|
||
|
||
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
|
||
if powerLimitW <= 0 {
|
||
return fmt.Errorf("invalid power limit %d", powerLimitW)
|
||
}
|
||
out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{
|
||
"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW),
|
||
}, nil, nil)
|
||
if err != nil {
|
||
return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// RunNvidiaBenchmark executes the full NVIDIA GPU benchmark: clock/power
// normalization, power calibration, per-GPU warmup + precision windows +
// combined ("mixed") steady phase, optional NCCL interconnect test, scoring,
// and report generation. It returns the run directory containing result.json,
// report.md, summary.txt and per-stage logs.
//
// GPUs run sequentially by default; opts.ParallelGPUs delegates the load
// phases to runNvidiaBenchmarkParallel. All clock/power changes are
// registered as restore actions and undone (in reverse order) on return.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	// Tolerate nil ctx/logFunc so callers need not supply them.
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/perf"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)

	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}

	// One timestamped directory per run holds all logs and artifacts.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "perf-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	hostname, _ := os.Hostname()
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   spec.Name,
		ParallelGPUs:       opts.ParallelGPUs,
		RampStep:           opts.RampStep,
		RampTotal:          opts.RampTotal,
		RampRunID:          opts.RampRunID,
		SelectedGPUIndices: append([]int(nil), selected...),
		HostConfig:         readBenchmarkHostConfig(),
		Normalization: BenchmarkNormalization{
			// Downgraded to "partial" whenever any setup step fails below.
			Status: "full",
		},
	}

	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
	var metricRows []GPUMetricRow
	metricTimelineSec := 0.0
	gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")

	// Server power characterization state — populated during per-GPU phases.
	var serverIdleW, serverLoadedWSum float64
	var serverIdleOK, serverLoadedOK bool
	var serverLoadedSamples int

	// Run nvidia-smi -q first: used both for the log file and as a fallback
	// source of max clock values when CSV clock fields are unsupported.
	var nvsmiQOut []byte
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		nvsmiQOut = out
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}

	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}
	// Enrich with max clocks from verbose output — covers GPUs where
	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)

	// Pre-existing compute workloads would skew results; record them as a
	// warning and mark normalization partial rather than aborting.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}

	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	// Undo all normalization/calibration changes in reverse order on return.
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()

	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
	calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
	restoreActions = append(restoreActions, powerRestoreActions...)
	for _, idx := range selected {
		if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
			result.Warnings = append(result.Warnings, fmt.Sprintf(
				"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
				idx, calib.AppliedPowerLimitW,
			))
		}
	}

	// Start background CPU load sampler — samples every 10s during GPU phases.
	cpuStopCh := make(chan struct{})
	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)

	if opts.ParallelGPUs {
		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
	} else {

		for _, idx := range selected {
			// Status starts FAILED and is upgraded at the end of the loop
			// body once the phases complete.
			gpuResult := BenchmarkGPUResult{
				Index:  idx,
				Status: "FAILED",
			}
			// Copy static inventory fields into the result record.
			if info, ok := infoByIndex[idx]; ok {
				gpuResult.UUID = info.UUID
				gpuResult.Name = info.Name
				gpuResult.BusID = info.BusID
				gpuResult.VBIOS = info.VBIOS
				gpuResult.PowerLimitW = info.PowerLimitW
				gpuResult.MultiprocessorCount = info.MultiprocessorCount
				gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
				gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
				gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
				gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
			}
			// Attach power-calibration outcome for this GPU.
			if calib, ok := calibByIndex[idx]; ok {
				gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
				gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
				gpuResult.PowerCalibrationTries = calib.Attempts
				gpuResult.PowerLimitDerated = calib.Derated
				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
				if calib.CoolingWarning != "" {
					gpuResult.CoolingWarning = calib.CoolingWarning
				}
			}
			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
				gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
			}

			// Idle baseline telemetry before applying load.
			baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
			if err != nil && err != context.Canceled {
				gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
			}
			gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))

			// Sample server idle power once (first GPU only — server state is global).
			if !serverIdleOK {
				if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
					serverIdleW = w
					serverIdleOK = true
					logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
				}
			}

			warmupCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(spec.WarmupSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
			}
			logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
			warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
			if warmupErr != nil {
				// A failed warmup leaves this GPU's result FAILED and skips
				// its remaining phases.
				gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
				result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
				continue
			}
			warmupParse := parseBenchmarkBurnLog(string(warmupOut))
			if gpuResult.ComputeCapability == "" {
				gpuResult.ComputeCapability = warmupParse.ComputeCapability
			}

			// Run synthetic precision phases and the combined steady phase as one
			// uninterrupted command so the GPU stays hot between windows.
			eccBase, _ := queryECCCounters(idx)
			supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
				if label == "mixed" {
					return fmt.Sprintf("gpu-%d-steady", idx)
				}
				return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
			})
			planCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(basePhaseSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
				"--precision-plan", strings.Join(planLabels, ","),
				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
			}
			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
			// Record telemetry and logs for each planned window.
			for _, phaseSpec := range planPhases {
				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
				}
				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
			}
			// Build per-precision steady results from the per-window
			// telemetry and burn-log segments.
			for _, prec := range supportedPrecisions {
				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
				phaseRows := phaseRowsByStage[stageName]
				phase := BenchmarkPrecisionSteadyPhase{
					Precision: prec,
					Status:    "OK",
					Steady:    summarizeBenchmarkTelemetry(phaseRows),
				}
				if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
					phase.Status = status
					phase.Notes = note
					gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
				}
				// Sum throughput over the supported profiles in this window.
				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
				gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
			}

			beforeThrottle, _ := queryThrottleCounters(idx)
			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))

			// Sample server power via IPMI in parallel with the steady phase.
			// We collect readings every 5s and average them.
			// NOTE(review): ipmiStopCh is closed immediately after this
			// goroutine starts, with no load-generating command in between
			// (the mixed phase already ran inside the precision plan above);
			// combined with the 15s pre-sample delay this appears to collect
			// zero samples, leaving serverLoadedW unset on the sequential
			// path — confirm whether the sampler should be started before
			// runBenchmarkPlannedCommandWithMetrics instead.
			ipmiStopCh := make(chan struct{})
			ipmiResultCh := make(chan float64, 1)
			go func() {
				defer close(ipmiResultCh)
				var samples []float64
				ticker := time.NewTicker(5 * time.Second)
				defer ticker.Stop()
				// First sample after a short warmup delay.
				select {
				case <-ipmiStopCh:
					return
				case <-time.After(15 * time.Second):
				}
				for {
					if w, err := queryIPMIServerPowerW(); err == nil {
						samples = append(samples, w)
					}
					select {
					case <-ipmiStopCh:
						// Deliver the average of what was collected, if anything.
						if len(samples) > 0 {
							var sum float64
							for _, w := range samples {
								sum += w
							}
							ipmiResultCh <- sum / float64(len(samples))
						}
						return
					case <-ticker.C:
					}
				}
			}()

			close(ipmiStopCh)
			// ok is false when the goroutine exited without a sample average.
			if loadedW, ok := <-ipmiResultCh; ok {
				serverLoadedWSum += loadedW
				serverLoadedSamples++
				serverLoadedOK = true
				logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
			}
			afterThrottle, _ := queryThrottleCounters(idx)
			if planErr != nil {
				gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
			}

			// The "mixed" window doubles as the combined steady phase.
			steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
			parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
			gpuResult.ComputeCapability = parseResult.ComputeCapability
			gpuResult.Backend = parseResult.Backend
			gpuResult.PrecisionResults = parseResult.Profiles
			if parseResult.Fallback {
				gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}

			gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
			gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
			// ECC delta over the whole load period for this GPU.
			if eccFinal, err := queryECCCounters(idx); err == nil {
				gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
			}

			if spec.CooldownSec > 0 {
				cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
				if err != nil && err != context.Canceled {
					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
				}
				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
			}

			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
			// Final status: classified from the plan error if one occurred;
			// PARTIAL on any precision failure or PTX fallback; else OK.
			if planErr != nil {
				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
			} else if len(gpuResult.PrecisionFailures) > 0 {
				gpuResult.Status = "PARTIAL"
			} else if parseResult.Fallback {
				gpuResult.Status = "PARTIAL"
			} else {
				gpuResult.Status = "OK"
			}

			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
		}

	} // end sequential path

	// Multi-GPU interconnect (NCCL) pass; when supported, its peak bus
	// bandwidth feeds every GPU's interconnect and composite scores.
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}

	// Stop CPU load sampler and attach results.
	close(cpuStopCh)
	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
		result.CPULoad = summarizeCPULoad(cpuSamples)
		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
				result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
		}
	}

	// Compute server power characterization from accumulated IPMI samples.
	var gpuReportedSumW float64
	for _, gpu := range result.GPUs {
		gpuReportedSumW += gpu.Steady.AvgPowerW
	}
	var serverLoadedW float64
	if serverLoadedSamples > 0 {
		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
	}
	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
	result.Cooling = summarizeBenchmarkCooling(metricRows)

	// Apply server-power penalty when IPMI reports the server delta is much
	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
	// CalibratedPeakPowerW and PowerSustainScore unreliable.
	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
	if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
		factor := sp.ReportingRatio / 0.75
		for i := range result.GPUs {
			result.GPUs[i].Scores.CompositeScore *= factor
			result.GPUs[i].Notes = append(result.GPUs[i].Notes,
				fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%",
					sp.ReportingRatio, factor*100))
		}
	}

	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)
	writeBenchmarkMetricsFiles(runDir, metricRows)

	// Persist the machine-readable and human-readable artifacts.
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}

	report := renderBenchmarkReportWithCharts(result)
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}

	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}

	return runDir, nil
}
|
||
|
||
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
|
||
switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
|
||
case NvidiaBenchmarkProfileStability:
|
||
opts.Profile = NvidiaBenchmarkProfileStability
|
||
case NvidiaBenchmarkProfileOvernight:
|
||
opts.Profile = NvidiaBenchmarkProfileOvernight
|
||
default:
|
||
opts.Profile = NvidiaBenchmarkProfileStandard
|
||
}
|
||
if opts.SizeMB < 0 {
|
||
opts.SizeMB = 0
|
||
}
|
||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||
return opts
|
||
}
|
||
|
||
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||
case NvidiaBenchmarkProfileStability:
|
||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
|
||
case NvidiaBenchmarkProfileOvernight:
|
||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
|
||
default:
|
||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
|
||
}
|
||
}
|
||
|
||
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct {
	fields   string
	extended bool // whether this query includes optional extended fields
	minimal  bool // clock fields omitted; max clocks must be filled separately
}{
	// Full query: base + clocks + extended attributes.
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
		extended: true,
	},
	// Base + clocks, without the extended attributes.
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
		extended: false,
	},
	// Minimal: identification and power limit only.
	{
		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
		minimal: true,
	},
}
|
||
|
||
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
//
// Besides max clocks, it also backfills DefaultPowerLimitW, PowerLimitW and
// MultiprocessorCount when they are still zero, from the same per-GPU
// section. infoByIndex is mutated in place; CSV-sourced (non-zero) values
// always win over the verbose output.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
		return
	}

	// Build bus_id → index map for matching verbose sections to GPU indices.
	busToBenchIdx := make(map[string]int, len(infoByIndex))
	for idx, info := range infoByIndex {
		if info.BusID != "" {
			// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
			// while --query-gpu returns the same format; normalise to lower.
			busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
		}
	}

	// Split the verbose output into per-GPU sections on "^GPU " lines.
	gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
	maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)

	// FindAllSubmatchIndex returns [start, end, group1Start, group1End] per
	// match; group 1 (loc[2]:loc[3]) is the bus id following "GPU ".
	sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
	for i, loc := range sectionStarts {
		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
		benchIdx, ok := busToBenchIdx[busID]
		if !ok {
			// Bus IDs from verbose output may have a different domain prefix;
			// try suffix match on the slot portion (XX:XX.X).
			for k, v := range busToBenchIdx {
				if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
					benchIdx = v
					ok = true
					break
				}
			}
		}
		if !ok {
			continue
		}

		// A section runs from this match to the start of the next (or EOF).
		end := len(nvsmiQ)
		if i+1 < len(sectionStarts) {
			end = sectionStarts[i+1][0]
		}
		section := nvsmiQ[loc[0]:end]

		info := infoByIndex[benchIdx]

		// Each field below is only backfilled when still at its zero value.
		if info.MaxGraphicsClockMHz == 0 {
			if m := maxGfxRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxGraphicsClockMHz = v
				}
			}
		}
		if info.MaxMemoryClockMHz == 0 {
			if m := maxMemRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxMemoryClockMHz = v
				}
			}
		}
		if info.DefaultPowerLimitW == 0 {
			if m := defaultPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.DefaultPowerLimitW = v
				}
			}
		}
		if info.PowerLimitW == 0 {
			if m := currentPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.PowerLimitW = v
				}
			}
		}
		if info.MultiprocessorCount == 0 {
			if m := smCountRe.FindSubmatch(section); m != nil {
				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
					info.MultiprocessorCount = v
				}
			}
		}
		// Write the (possibly updated) copy back into the map.
		infoByIndex[benchIdx] = info
	}
}
|
||
|
||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||
var lastErr error
|
||
for _, q := range benchmarkGPUInfoQueries {
|
||
args := []string{
|
||
"--query-gpu=" + q.fields,
|
||
"--format=csv,noheader,nounits",
|
||
}
|
||
if len(gpuIndices) > 0 {
|
||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||
}
|
||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||
if err != nil {
|
||
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
|
||
continue
|
||
}
|
||
|
||
r := csv.NewReader(strings.NewReader(string(out)))
|
||
r.TrimLeadingSpace = true
|
||
r.FieldsPerRecord = -1
|
||
rows, err := r.ReadAll()
|
||
if err != nil {
|
||
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||
continue
|
||
}
|
||
|
||
minFields := 6
|
||
if !q.minimal {
|
||
minFields = 9
|
||
}
|
||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||
for _, row := range rows {
|
||
if len(row) < minFields {
|
||
continue
|
||
}
|
||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||
if err != nil {
|
||
continue
|
||
}
|
||
info := benchmarkGPUInfo{
|
||
Index: idx,
|
||
UUID: strings.TrimSpace(row[1]),
|
||
Name: strings.TrimSpace(row[2]),
|
||
BusID: strings.TrimSpace(row[3]),
|
||
VBIOS: strings.TrimSpace(row[4]),
|
||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||
}
|
||
if !q.minimal {
|
||
info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
|
||
info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
|
||
if len(row) >= 9 {
|
||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||
}
|
||
if q.extended {
|
||
if len(row) >= 10 {
|
||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||
}
|
||
if len(row) >= 11 {
|
||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||
}
|
||
}
|
||
}
|
||
infoByIndex[idx] = info
|
||
}
|
||
return infoByIndex, nil
|
||
}
|
||
return nil, lastErr
|
||
}
|
||
|
||
// applyBenchmarkNormalization puts every selected GPU into a reproducible
// state before benchmarking: persistence mode on, graphics clock locked to
// MaxGraphicsClockMHz, memory clock locked to MaxMemoryClockMHz. Each applied
// clock lock registers an undo closure; the returned slice of restore actions
// must be executed by the caller when the benchmark finishes. Any step that
// fails marks result.Normalization.Status as "partial" and records a per-GPU
// note, but never aborts — the benchmark still runs, just less comparable.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	// Persistence mode and clock locks require root; without it, record the
	// skip for every GPU and return no restore actions.
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}

	var restore []benchmarkRestoreAction
	for _, idx := range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Step 1: enable persistence mode (nvidia-smi -pm 1). Failure is
		// non-fatal; the run continues with status downgraded to partial.
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}

		// Step 2: lock the graphics clock (-lgc) to the GPU's maximum. On
		// success, register an -rgc restore action. Locking is skipped when
		// the inventory lookup failed or reported no max clock.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				// Capture idx by value so the deferred restore closure does
				// not observe a later loop iteration's index.
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					// Restore runs with a fresh background context so it still
					// executes after the benchmark context is cancelled.
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		} else {
			rec.GPUClockLockStatus = "skipped"
			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
			result.Normalization.Status = "partial"
		}

		// Step 3: lock the memory clock (-lmc). Unlike the graphics clock,
		// "deferred"/"not supported" driver responses are treated as an
		// expected unsupported path rather than a hard failure.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}

		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}
|
||
|
||
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
|
||
if durationSec <= 0 {
|
||
return nil, nil
|
||
}
|
||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||
var rows []GPUMetricRow
|
||
start := time.Now()
|
||
for {
|
||
if ctx.Err() != nil {
|
||
return rows, ctx.Err()
|
||
}
|
||
samples, err := sampleBenchmarkTelemetry(gpuIndices)
|
||
if err == nil {
|
||
elapsed := time.Since(start).Seconds()
|
||
for i := range samples {
|
||
samples[i].ElapsedSec = elapsed
|
||
}
|
||
rows = append(rows, samples...)
|
||
}
|
||
if time.Now().After(deadline) {
|
||
break
|
||
}
|
||
select {
|
||
case <-ctx.Done():
|
||
return rows, ctx.Err()
|
||
case <-time.After(time.Second):
|
||
}
|
||
}
|
||
return rows, nil
|
||
}
|
||
|
||
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
|
||
stopCh := make(chan struct{})
|
||
doneCh := make(chan struct{})
|
||
var metricRows []GPUMetricRow
|
||
start := time.Now()
|
||
|
||
go func() {
|
||
defer close(doneCh)
|
||
ticker := time.NewTicker(time.Second)
|
||
defer ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-stopCh:
|
||
return
|
||
case <-ticker.C:
|
||
samples, err := sampleBenchmarkTelemetry(gpuIndices)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
elapsed := time.Since(start).Seconds()
|
||
for i := range samples {
|
||
samples[i].ElapsedSec = elapsed
|
||
}
|
||
metricRows = append(metricRows, samples...)
|
||
}
|
||
}
|
||
}()
|
||
|
||
out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
|
||
close(stopCh)
|
||
<-doneCh
|
||
|
||
return out, metricRows, err
|
||
}
|
||
|
||
// benchmarkPlannedPhase describes one expected phase of a multi-phase
// benchmark command: its label in the plan, the metric stage its telemetry
// rows are attributed to, and its planned duration.
type benchmarkPlannedPhase struct {
	PlanLabel   string // label used for the phase in the run plan
	MetricStage string // stage key used when bucketing telemetry rows
	DurationSec int    // planned phase duration in seconds
}
|
||
|
||
func runBenchmarkPlannedCommandWithMetrics(
|
||
ctx context.Context,
|
||
verboseLog, name string,
|
||
cmd []string,
|
||
env []string,
|
||
gpuIndices []int,
|
||
phases []benchmarkPlannedPhase,
|
||
logFunc func(string),
|
||
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
|
||
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
|
||
return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err
|
||
}
|
||
|
||
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
|
||
out := make(map[string][]GPUMetricRow, len(phases))
|
||
if len(rows) == 0 || len(phases) == 0 {
|
||
return out
|
||
}
|
||
for _, row := range rows {
|
||
idx := len(phases) - 1
|
||
var elapsed float64
|
||
for i, phase := range phases {
|
||
durationSec := phase.DurationSec
|
||
if durationSec <= 0 {
|
||
durationSec = 1
|
||
}
|
||
elapsed += float64(durationSec)
|
||
if row.ElapsedSec < elapsed {
|
||
idx = i
|
||
break
|
||
}
|
||
}
|
||
out[phases[idx].MetricStage] = append(out[phases[idx].MetricStage], row)
|
||
}
|
||
return out
|
||
}
|
||
|
||
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
|
||
out := make(map[string][]byte)
|
||
var current string
|
||
for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") {
|
||
trimmed := strings.TrimSpace(stripBenchmarkPrefix(line))
|
||
switch {
|
||
case strings.HasPrefix(trimmed, "phase_begin="):
|
||
current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin="))
|
||
case strings.HasPrefix(trimmed, "phase_end="):
|
||
current = ""
|
||
case current != "":
|
||
out[current] = append(out[current], []byte(line+"\n")...)
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// benchmarkCoolingSample is one snapshot of host fan telemetry taken
// alongside GPU metrics during a benchmark run.
type benchmarkCoolingSample struct {
	AvgFanRPM             float64 // mean RPM across readable fans
	AvgFanDutyCyclePct    float64 // mean fan duty cycle in percent, when available
	FanDutyCycleAvailable bool    // whether duty-cycle telemetry exists on this host
}
|
||
|
||
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
|
||
samples, err := sampleGPUMetrics(gpuIndices)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
fanSample := sampleBenchmarkCoolingSample()
|
||
for i := range samples {
|
||
samples[i].FanAvgRPM = fanSample.AvgFanRPM
|
||
samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct
|
||
samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable
|
||
}
|
||
return samples, nil
|
||
}
|
||
|
||
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
|
||
fans, _ := sampleFanSpeeds()
|
||
avgRPM, _, _ := fanRPMStats(fans)
|
||
dutyPct, dutyAvailable := sampleFanDutyCyclePct()
|
||
return benchmarkCoolingSample{
|
||
AvgFanRPM: avgRPM,
|
||
AvgFanDutyCyclePct: dutyPct,
|
||
FanDutyCycleAvailable: dutyAvailable,
|
||
}
|
||
}
|
||
|
||
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
|
||
if len(rows) == 0 {
|
||
return nil
|
||
}
|
||
stageEnd := offset + durationSec
|
||
if stageEnd <= offset {
|
||
stageEnd = offset
|
||
for _, row := range rows {
|
||
if row.ElapsedSec+offset > stageEnd {
|
||
stageEnd = row.ElapsedSec + offset
|
||
}
|
||
}
|
||
}
|
||
out := make([]GPUMetricRow, len(rows))
|
||
for i, row := range rows {
|
||
row.Stage = stage
|
||
row.ElapsedSec += offset
|
||
row.StageStartSec = offset
|
||
row.StageEndSec = stageEnd
|
||
out[i] = row
|
||
}
|
||
return out
|
||
}
|
||
|
||
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
|
||
annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
|
||
*allRows = append(*allRows, annotated...)
|
||
*cursor += durationSec
|
||
}
|
||
|
||
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
|
||
if len(rows) == 0 {
|
||
return
|
||
}
|
||
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows)
|
||
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows)
|
||
}
|
||
|
||
// appendBenchmarkStageLog appends raw output to the aggregate log file under
// a "source | stage" banner, ensuring the section ends with a newline.
// Logging is best-effort: open/write errors are silently ignored.
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
	if path == "" || len(raw) == 0 {
		return
	}
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return
	}
	defer f.Close()
	_, _ = fmt.Fprintf(f, "\n========== %s | stage=%s ==========\n", source, stage)
	_, _ = f.Write(raw)
	if raw[len(raw)-1] != '\n' {
		_, _ = f.WriteString("\n")
	}
}
|
||
|
||
// parseBenchmarkBurnLog parses the raw burn-tool log into a structured
// result: device identity lines (device=, compute_capability=, backend=,
// duration_s=) plus per-precision profile lines matched by the ready /
// skipped / iterations patterns. Profiles accumulate in a map and are
// emitted sorted by name; throughput is computed as 2*M*N*K*iterations over
// the reported duration.
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
	result := benchmarkBurnParseResult{}
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	profiles := make(map[string]*benchmarkBurnProfile)
	for _, line := range lines {
		// Strip any "[gpu N] " prefix so patterns match uniformly.
		line = stripBenchmarkPrefix(strings.TrimSpace(line))
		if line == "" {
			continue
		}
		switch {
		case strings.HasPrefix(line, "device="):
			result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
		case strings.HasPrefix(line, "compute_capability="):
			result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
		case strings.HasPrefix(line, "backend="):
			result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
			// driver-ptx marks the degraded fallback compilation path.
			result.Fallback = result.Backend == "driver-ptx"
		case strings.HasPrefix(line, "duration_s="):
			result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
		default:
			// "ready" lines: one per lane. M/N/K are overwritten by each lane
			// — presumably all lanes of a profile share one problem size;
			// TODO confirm against the burn tool's output format.
			if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = true
				profile.lanes++
				profile.m, _ = strconv.ParseUint(m[3], 10, 64)
				profile.n, _ = strconv.ParseUint(m[4], 10, 64)
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			// "skipped" lines mark a precision unsupported, with a reason.
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			// Iteration counts accumulate across all lanes of a profile.
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters
			}
		}
	}

	// Emit profiles in deterministic (name-sorted) order.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		w := precisionWeight(profile.category)
		precision.Weight = w
		// GEMM performs 2*M*N*K ops per iteration; convert to TOPS over the
		// run duration, then scale by the fp32-equivalence weight.
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
			precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}
|
||
|
||
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
|
||
if profile, ok := profiles[name]; ok {
|
||
return profile
|
||
}
|
||
category := "other"
|
||
switch {
|
||
case strings.HasPrefix(name, "fp64"):
|
||
category = "fp64"
|
||
case strings.HasPrefix(name, "fp32"):
|
||
category = "fp32_tf32"
|
||
case strings.HasPrefix(name, "fp16"):
|
||
category = "fp16_bf16"
|
||
case strings.HasPrefix(name, "int8"):
|
||
category = "int8"
|
||
case strings.HasPrefix(name, "fp8"):
|
||
category = "fp8"
|
||
case strings.HasPrefix(name, "fp4"):
|
||
category = "fp4"
|
||
}
|
||
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
|
||
profiles[name] = profile
|
||
return profile
|
||
}
|
||
|
||
// precisionWeight maps a precision category to its fp32-equivalence factor —
// how much "real" numeric work one operation of that type performs relative
// to single precision (fp32 = 1.0):
//
//	fp64 = 2.0   — double precision, 2× more bits per operand
//	fp32 = 1.0   — single precision baseline
//	fp16 = 0.5   — half precision
//	int8 = 0.25  — quarter precision
//	fp8  = 0.25  — quarter precision
//	fp4  = 0.125 — eighth precision
//
// Multiplying raw TOPS by this weight yields fp32-equivalent TOPS so results
// across precisions share one numeric scale. Unknown categories weigh 1.0.
func precisionWeight(category string) float64 {
	switch category {
	case "fp64":
		return 2.0
	case "fp32_tf32":
		return 1.0
	case "fp16_bf16":
		return 0.5
	case "int8", "fp8":
		return 0.25
	case "fp4":
		return 0.125
	}
	return 1.0
}
|
||
|
||
// stripBenchmarkPrefix removes a leading "[gpu N] " tag from a log line.
// Lines without that tag (or without the closing "] ") are returned as-is.
func stripBenchmarkPrefix(line string) string {
	if !strings.HasPrefix(line, "[gpu ") {
		return line
	}
	if _, rest, found := strings.Cut(line, "] "); found {
		return rest
	}
	return line
}
|
||
|
||
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
|
||
summary := BenchmarkTelemetrySummary{}
|
||
if len(rows) == 0 {
|
||
return summary
|
||
}
|
||
temps := make([]float64, 0, len(rows))
|
||
powers := make([]float64, 0, len(rows))
|
||
clocks := make([]float64, 0, len(rows))
|
||
memClocks := make([]float64, 0, len(rows))
|
||
usages := make([]float64, 0, len(rows))
|
||
memUsages := make([]float64, 0, len(rows))
|
||
summary.DurationSec = rows[len(rows)-1].ElapsedSec
|
||
summary.Samples = len(rows)
|
||
for _, row := range rows {
|
||
temps = append(temps, row.TempC)
|
||
powers = append(powers, row.PowerW)
|
||
clocks = append(clocks, row.ClockMHz)
|
||
memClocks = append(memClocks, row.MemClockMHz)
|
||
usages = append(usages, row.UsagePct)
|
||
memUsages = append(memUsages, row.MemUsagePct)
|
||
}
|
||
summary.AvgTempC = benchmarkMean(temps)
|
||
summary.P95TempC = benchmarkPercentile(temps, 95)
|
||
summary.AvgPowerW = benchmarkMean(powers)
|
||
summary.P95PowerW = benchmarkPercentile(powers, 95)
|
||
summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
|
||
summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
|
||
summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
|
||
summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
|
||
summary.AvgUsagePct = benchmarkMean(usages)
|
||
summary.AvgMemUsagePct = benchmarkMean(memUsages)
|
||
summary.ClockCVPct = benchmarkCV(clocks)
|
||
summary.PowerCVPct = benchmarkCV(powers)
|
||
summary.TempCVPct = benchmarkCV(temps)
|
||
summary.ClockDriftPct = benchmarkClockDrift(clocks)
|
||
return summary
|
||
}
|
||
|
||
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
|
||
if len(rows) == 0 {
|
||
return nil
|
||
}
|
||
var rpmValues []float64
|
||
var dutyValues []float64
|
||
for _, row := range rows {
|
||
if row.FanAvgRPM > 0 {
|
||
rpmValues = append(rpmValues, row.FanAvgRPM)
|
||
}
|
||
if row.FanDutyCycleAvailable {
|
||
dutyValues = append(dutyValues, row.FanDutyCyclePct)
|
||
}
|
||
}
|
||
if len(rpmValues) == 0 && len(dutyValues) == 0 {
|
||
return nil
|
||
}
|
||
summary := &BenchmarkCoolingSummary{
|
||
Available: true,
|
||
AvgFanRPM: benchmarkMean(rpmValues),
|
||
}
|
||
if len(dutyValues) > 0 {
|
||
summary.FanDutyCycleAvailable = true
|
||
summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
|
||
summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
|
||
} else {
|
||
summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
|
||
}
|
||
return summary
|
||
}
|
||
|
||
// scoreBenchmarkGPUResult derives the per-GPU scorecard from the collected
// phase telemetry: synthetic and mixed throughput, power/thermal sustain,
// stability, the composite score, and a per-SM-per-GHz normalization.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}

	// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
	// Each precision ran alone with full GPU dedicated — peak capability.
	for _, p := range gpu.PrecisionSteady {
		score.SyntheticScore += p.WeightedTeraOpsPerSec
	}

	// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
	// All precisions compete simultaneously — closer to real inference workloads.
	for _, p := range gpu.PrecisionResults {
		if p.Supported {
			score.MixedScore += p.WeightedTeraOpsPerSec
		}
	}

	// MixedEfficiency = MixedScore / SyntheticScore.
	// Measures how well the GPU sustains throughput under concurrent mixed load.
	// A healthy GPU scores ~0.8–0.95; severe degradation suggests bandwidth
	// contention or scheduler inefficiency.
	if score.SyntheticScore > 0 && score.MixedScore > 0 {
		score.MixedEfficiency = score.MixedScore / score.SyntheticScore
	}

	// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
	// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
	// bonus for GPUs that handle mixed-precision concurrency well.
	// Falls back to MixedScore alone when per-precision data is absent.
	switch {
	case score.SyntheticScore > 0:
		score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
	case score.MixedScore > 0:
		score.ComputeScore = score.MixedScore
	}
	// PowerSustainScore: measures how close the GPU came to its rated TDP under
	// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
	// Penalty applied symmetrically for both under- and over-TDP deviations:
	//	score = max(0, 100 − |measured − rated| / rated × 100)
	// Under-TDP → power delivery / cooling issue.
	// Over-TDP → power limit not properly enforced / power regulation fault.
	// Falls back to 0 if calibration was not performed (dcgmi unavailable).
	{
		// Prefer the factory default power limit as the rated reference;
		// fall back to the currently configured limit when unknown.
		ref := gpu.DefaultPowerLimitW
		if ref <= 0 {
			ref = gpu.PowerLimitW
		}
		if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
			deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
			score.PowerSustainScore = clampScore(100 - deviationPct)
		}
	}
	// ThermalSustainScore: fraction of steady-state runtime spent in HW/SW
	// thermal slowdown, inverted onto a 0–100 scale (runtime floored at 1µs
	// to avoid division by zero).
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
	// StabilityScore: prefer per-precision steady phases where each window runs a
	// single kernel type so PowerCVPct is a genuine stability signal (not a
	// workload-mix artifact). Fall back to combined steady using clock-only metrics
	// when per-precision data is absent (older results, short profiles).
	if len(gpu.PrecisionSteady) > 0 {
		var sum float64
		for _, p := range gpu.PrecisionSteady {
			sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
		}
		score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
	} else {
		score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
	}
	score.CompositeScore = compositeBenchmarkScore(score)
	// Normalized throughput per SM per GHz, for cross-model comparison.
	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
	}
	return score
}
|
||
|
||
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
||
// Weights after introducing calibrated power reference:
|
||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||
// stability 0.25 — clock/power variance matters for reproducibility
|
||
// power 0.15 — GPU reaches rated TDP under targeted_power? lower weight
|
||
// because calibration may be absent (dcgmi not installed)
|
||
// NCCL bonus 0.10 — interconnect health
|
||
// cap 1.10
|
||
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||
if score.InterconnectScore > 0 {
|
||
quality += 0.10
|
||
}
|
||
if quality > 1.10 {
|
||
quality = 1.10
|
||
}
|
||
return score.ComputeScore * quality
|
||
}
|
||
|
||
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
|
||
var reasons []string
|
||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||
if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
|
||
reasons = append(reasons, "power_capped")
|
||
}
|
||
if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
|
||
reasons = append(reasons, "thermal_limited")
|
||
}
|
||
if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
|
||
reasons = append(reasons, "sync_boost_limited")
|
||
}
|
||
if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
|
||
reasons = append(reasons, "low_sm_clock_vs_target")
|
||
}
|
||
if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
|
||
reasons = append(reasons, "variance_too_high")
|
||
}
|
||
if normalizationStatus != "full" {
|
||
reasons = append(reasons, "normalization_partial")
|
||
}
|
||
if gpu.PowerLimitDerated {
|
||
reasons = append(reasons, "power_limit_derated")
|
||
}
|
||
if gpu.ECC.Uncorrected > 0 {
|
||
reasons = append(reasons, "ecc_uncorrected_errors")
|
||
}
|
||
if gpu.ECC.Corrected > 0 {
|
||
reasons = append(reasons, "ecc_corrected_errors")
|
||
}
|
||
return dedupeStrings(reasons)
|
||
}
|
||
|
||
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
|
||
result := &BenchmarkInterconnectResult{
|
||
Status: "UNSUPPORTED",
|
||
Attempted: true,
|
||
SelectedGPUIndices: append([]int(nil), gpuIndices...),
|
||
}
|
||
cmd := []string{
|
||
"all_reduce_perf",
|
||
"-b", "512M",
|
||
"-e", "4G",
|
||
"-f", "2",
|
||
"-g", strconv.Itoa(len(gpuIndices)),
|
||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||
}
|
||
env := []string{
|
||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||
}
|
||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||
if err != nil {
|
||
result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
|
||
return result
|
||
}
|
||
avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
|
||
result.Status = "OK"
|
||
result.Supported = true
|
||
result.AvgAlgBWGBps = avgAlg
|
||
result.MaxAlgBWGBps = maxAlg
|
||
result.AvgBusBWGBps = avgBus
|
||
result.MaxBusBWGBps = maxBus
|
||
return result
|
||
}
|
||
|
||
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
|
||
lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
|
||
var algs []float64
|
||
var buses []float64
|
||
for _, line := range lines {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" || strings.HasPrefix(line, "#") {
|
||
continue
|
||
}
|
||
fields := strings.Fields(line)
|
||
if len(fields) < 8 {
|
||
continue
|
||
}
|
||
for i := 0; i+2 < len(fields); i++ {
|
||
timeVal, err1 := strconv.ParseFloat(fields[i], 64)
|
||
algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
|
||
busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
|
||
if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
|
||
algs = append(algs, algVal)
|
||
buses = append(buses, busVal)
|
||
break
|
||
}
|
||
}
|
||
}
|
||
if len(algs) == 0 {
|
||
return 0, 0, 0, 0
|
||
}
|
||
return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
|
||
}
|
||
|
||
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
|
||
out, err := satExecCommand(
|
||
"nvidia-smi",
|
||
"--id="+strconv.Itoa(gpuIndex),
|
||
"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
|
||
"--format=csv,noheader,nounits",
|
||
).Output()
|
||
if err != nil {
|
||
return BenchmarkThrottleCounters{}, err
|
||
}
|
||
fields := strings.Split(strings.TrimSpace(string(out)), ",")
|
||
if len(fields) < 5 {
|
||
return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
|
||
}
|
||
return BenchmarkThrottleCounters{
|
||
SWPowerCapUS: parseBenchmarkUint64(fields[0]),
|
||
SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]),
|
||
SyncBoostUS: parseBenchmarkUint64(fields[2]),
|
||
HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]),
|
||
HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
|
||
}, nil
|
||
}
|
||
|
||
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
|
||
return BenchmarkThrottleCounters{
|
||
SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
|
||
SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
|
||
SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
|
||
HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
|
||
HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
|
||
}
|
||
}
|
||
|
||
func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
|
||
out, err := satExecCommand(
|
||
"nvidia-smi",
|
||
"--id="+strconv.Itoa(gpuIndex),
|
||
"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
|
||
"--format=csv,noheader,nounits",
|
||
).Output()
|
||
if err != nil {
|
||
return BenchmarkECCCounters{}, err
|
||
}
|
||
fields := strings.Split(strings.TrimSpace(string(out)), ",")
|
||
if len(fields) < 2 {
|
||
return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
|
||
}
|
||
corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
|
||
uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
|
||
if err1 != nil || err2 != nil {
|
||
// ECC may be disabled on this GPU — return zero counters silently.
|
||
return BenchmarkECCCounters{}, nil
|
||
}
|
||
return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
|
||
}
|
||
|
||
func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
|
||
return BenchmarkECCCounters{
|
||
Corrected: saturatingSub(after.Corrected, before.Corrected),
|
||
Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected),
|
||
}
|
||
}
|
||
|
||
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
|
||
args := []string{
|
||
"--query-compute-apps=gpu_uuid,pid,process_name",
|
||
"--format=csv,noheader,nounits",
|
||
}
|
||
if len(gpuIndices) > 0 {
|
||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||
}
|
||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
var lines []string
|
||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if line == "" {
|
||
continue
|
||
}
|
||
lines = append(lines, line)
|
||
}
|
||
return lines, nil
|
||
}
|
||
|
||
// finalizeBenchmarkGPUResult fills in defaults before a per-GPU result is
// reported: an empty status becomes "OK", and a missing composite score is
// recomputed from the individual score components.
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
	if gpu.Status == "" {
		gpu.Status = "OK"
	}
	if gpu.Scores.CompositeScore == 0 {
		gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
	}
	return gpu
}
|
||
|
||
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||
var findings []string
|
||
|
||
passed := 0
|
||
for _, gpu := range result.GPUs {
|
||
if gpu.Status == "OK" {
|
||
passed++
|
||
}
|
||
}
|
||
total := len(result.GPUs)
|
||
if total > 0 {
|
||
if passed == total {
|
||
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
|
||
} else {
|
||
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
|
||
}
|
||
}
|
||
|
||
if result.Normalization.Status != "full" {
|
||
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
||
}
|
||
for _, gpu := range result.GPUs {
|
||
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
|
||
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
|
||
continue
|
||
}
|
||
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
||
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
||
continue
|
||
}
|
||
for _, reason := range gpu.DegradationReasons {
|
||
switch reason {
|
||
case "power_capped":
|
||
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
|
||
case "thermal_limited":
|
||
msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
|
||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
|
||
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
|
||
msg += fmt.Sprintf(
|
||
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
|
||
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
|
||
)
|
||
}
|
||
findings = append(findings, msg)
|
||
case "sync_boost_limited":
|
||
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
|
||
case "low_sm_clock_vs_target":
|
||
findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
|
||
case "variance_too_high":
|
||
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
|
||
case "normalization_partial":
|
||
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
|
||
case "power_limit_derated":
|
||
findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
|
||
case "ecc_uncorrected_errors":
|
||
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
|
||
case "ecc_corrected_errors":
|
||
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
|
||
}
|
||
}
|
||
if gpu.CoolingWarning != "" {
|
||
findings = append(findings, fmt.Sprintf(
|
||
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
||
gpu.Index, gpu.CoolingWarning,
|
||
))
|
||
}
|
||
if len(gpu.PrecisionFailures) > 0 {
|
||
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
|
||
}
|
||
if gpu.Backend == "driver-ptx" {
|
||
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
||
}
|
||
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
|
||
findings = append(findings, fmt.Sprintf(
|
||
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
|
||
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
||
))
|
||
}
|
||
// Flag significant TDP deviation (over or under) from calibration.
|
||
if gpu.CalibratedPeakPowerW > 0 {
|
||
ref := gpu.DefaultPowerLimitW
|
||
if ref <= 0 {
|
||
ref = gpu.PowerLimitW
|
||
}
|
||
if ref > 0 {
|
||
deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
|
||
switch {
|
||
case deviationPct < -10:
|
||
findings = append(findings, fmt.Sprintf(
|
||
"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
|
||
gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
|
||
))
|
||
case deviationPct > 5:
|
||
findings = append(findings, fmt.Sprintf(
|
||
"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
|
||
gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
|
||
))
|
||
}
|
||
}
|
||
}
|
||
}
|
||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||
}
|
||
if cl := result.CPULoad; cl != nil {
|
||
switch cl.Status {
|
||
case "high":
|
||
findings = append(findings, fmt.Sprintf(
|
||
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
|
||
cl.AvgPct, cl.MaxPct,
|
||
))
|
||
case "unstable":
|
||
findings = append(findings, fmt.Sprintf(
|
||
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
|
||
cl.AvgPct, cl.P95Pct,
|
||
))
|
||
}
|
||
}
|
||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||
if sp.ReportingRatio < 0.75 {
|
||
findings = append(findings, fmt.Sprintf(
|
||
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
|
||
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||
))
|
||
} else if sp.ReportingRatio > 1.25 {
|
||
findings = append(findings, fmt.Sprintf(
|
||
"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
|
||
sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
|
||
))
|
||
}
|
||
}
|
||
return dedupeStrings(findings)
|
||
}
|
||
|
||
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
|
||
if len(result.GPUs) == 0 {
|
||
return "FAILED"
|
||
}
|
||
hasOK := false
|
||
hasPartial := result.Normalization.Status != "full"
|
||
for _, gpu := range result.GPUs {
|
||
switch gpu.Status {
|
||
case "OK":
|
||
hasOK = true
|
||
case "PARTIAL", "UNSUPPORTED":
|
||
hasPartial = true
|
||
}
|
||
}
|
||
if !hasOK {
|
||
return "FAILED"
|
||
}
|
||
if hasPartial {
|
||
return "PARTIAL"
|
||
}
|
||
return "OK"
|
||
}
|
||
|
||
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
|
||
for i := range items {
|
||
if items[i].Index == idx {
|
||
return &items[i]
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func classifySATErrorStatus(out []byte, err error) string {
|
||
status, _ := classifySATResult("benchmark", out, err)
|
||
if status == "UNSUPPORTED" {
|
||
return "UNSUPPORTED"
|
||
}
|
||
return "FAILED"
|
||
}
|
||
|
||
// parseBenchmarkFloat parses an nvidia-smi style numeric field. Empty
// strings and the sentinels "n/a" / "[not supported]" (case-insensitive)
// yield zero, as do unparseable values.
func parseBenchmarkFloat(raw string) float64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	parsed, err := strconv.ParseFloat(trimmed, 64)
	if err != nil {
		return 0
	}
	return parsed
}
|
||
|
||
// parseBenchmarkUint64 parses an nvidia-smi style unsigned integer field.
// Empty strings and the sentinels "n/a" / "[not supported]"
// (case-insensitive) yield zero, as do unparseable values.
func parseBenchmarkUint64(raw string) uint64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	parsed, err := strconv.ParseUint(trimmed, 10, 64)
	if err != nil {
		return 0
	}
	return parsed
}
|
||
|
||
// benchmarkMean returns the arithmetic mean of values, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	total := 0.0
	for _, v := range values {
		total += v
	}
	return total / float64(n)
}
|
||
|
||
// benchmarkPercentile returns the p-th percentile (0..100) of values using
// linear interpolation between the two nearest ranks. Empty input yields 0;
// the input slice is not modified.
func benchmarkPercentile(values []float64, p float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	if n == 1 {
		return sorted[0]
	}
	rank := (p / 100.0) * float64(n-1)
	lo := int(math.Floor(rank))
	hi := int(math.Ceil(rank))
	if lo == hi {
		return sorted[lo]
	}
	weight := rank - float64(lo)
	return sorted[lo] + (sorted[hi]-sorted[lo])*weight
}
|
||
|
||
// benchmarkCV returns the coefficient of variation of values as a percentage
// (population standard deviation divided by the mean, times 100). Empty input
// or a zero mean yields 0.
func benchmarkCV(values []float64) float64 {
	n := len(values)
	if n == 0 {
		return 0
	}
	var total float64
	for _, v := range values {
		total += v
	}
	avg := total / float64(n)
	if avg == 0 {
		return 0
	}
	var sqSum float64
	for _, v := range values {
		d := v - avg
		sqSum += d * d
	}
	return math.Sqrt(sqSum/float64(n)) / avg * 100
}
|
||
|
||
// benchmarkClockDrift compares the first and last quarter of the sample
// series and returns the percentage drop from head to tail. It returns 0 for
// fewer than 4 samples, a non-positive head, or when the tail did not drop.
func benchmarkClockDrift(values []float64) float64 {
	if len(values) < 4 {
		return 0
	}
	window := len(values) / 4
	if window < 1 {
		window = 1
	}
	avg := func(vs []float64) float64 {
		var s float64
		for _, v := range vs {
			s += v
		}
		return s / float64(len(vs))
	}
	head := avg(values[:window])
	tail := avg(values[len(values)-window:])
	if head <= 0 || tail >= head {
		return 0
	}
	return (head - tail) / head * 100
}
|
||
|
||
// benchmarkMax returns the largest value in values, or 0 for an empty slice.
func benchmarkMax(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	best := values[0]
	for _, v := range values[1:] {
		if v > best {
			best = v
		}
	}
	return best
}
|
||
|
||
// clampScore limits value to the benchmark score range [0, 100].
func clampScore(value float64) float64 {
	if value < 0 {
		return 0
	}
	if value > 100 {
		return 100
	}
	return value
}
|
||
|
||
// dedupeStrings trims each entry, drops empties and duplicates, and returns
// the survivors in first-seen order. A nil or empty input returns nil.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	out := make([]string, 0, len(values))
	seen := make(map[string]struct{}, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if trimmed == "" {
			continue
		}
		if _, dup := seen[trimmed]; dup {
			continue
		}
		seen[trimmed] = struct{}{}
		out = append(out, trimmed)
	}
	return out
}
|
||
|
||
// saturatingSub returns after-before, clamped at zero so counter resets or
// wrap-arounds never produce a huge unsigned difference.
func saturatingSub(after, before uint64) uint64 {
	if after > before {
		return after - before
	}
	return 0
}
|
||
|
||
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|
||
|
||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||
func queryIPMIServerPowerW() (float64, error) {
|
||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||
defer cancel()
|
||
cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading")
|
||
out, err := cmd.Output()
|
||
if err != nil {
|
||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||
}
|
||
if w := parseDCMIPowerReading(string(out)); w > 0 {
|
||
return w, nil
|
||
}
|
||
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||
}
|
||
|
||
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
|
||
// durationSec seconds. Returns the mean of all successful samples.
|
||
// Returns 0, false if IPMI is unavailable.
|
||
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
|
||
if durationSec <= 0 {
|
||
return 0, false
|
||
}
|
||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||
var samples []float64
|
||
loop:
|
||
for {
|
||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||
samples = append(samples, w)
|
||
}
|
||
if time.Now().After(deadline) {
|
||
break
|
||
}
|
||
select {
|
||
case <-ctx.Done():
|
||
break loop
|
||
case <-time.After(2 * time.Second):
|
||
}
|
||
}
|
||
if len(samples) == 0 {
|
||
return 0, false
|
||
}
|
||
var sum float64
|
||
for _, w := range samples {
|
||
sum += w
|
||
}
|
||
return sum / float64(len(samples)), true
|
||
}
|
||
|
||
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||
// IPMI samples plus the GPU-reported average power during steady state.
|
||
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
||
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
||
if !ipmiAvailable {
|
||
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||
return sp
|
||
}
|
||
sp.IdleW = idleW
|
||
sp.LoadedW = loadedW
|
||
sp.DeltaW = loadedW - idleW
|
||
sp.GPUReportedSumW = gpuReportedSumW
|
||
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
|
||
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
|
||
}
|
||
return sp
|
||
}
|
||
|
||
// readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT").
// Returns empty string if unavailable (non-Linux or missing DMI entry).
func readServerModel() string {
	raw, readErr := os.ReadFile("/sys/class/dmi/id/product_name")
	if readErr != nil {
		return ""
	}
	return strings.TrimSpace(string(raw))
}
|
||
|
||
// filterRowsByGPU returns only the metric rows for a specific GPU index.
|
||
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
|
||
var out []GPUMetricRow
|
||
for _, r := range rows {
|
||
if r.GPUIndex == gpuIndex {
|
||
out = append(out, r)
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
|
||
// and returns a per-GPU parse result map.
|
||
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
|
||
gpuLines := make(map[int][]string)
|
||
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
|
||
line = strings.TrimSpace(line)
|
||
if !strings.HasPrefix(line, "[gpu ") {
|
||
continue
|
||
}
|
||
end := strings.Index(line, "] ")
|
||
if end < 0 {
|
||
continue
|
||
}
|
||
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
|
||
if err != nil {
|
||
continue
|
||
}
|
||
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
|
||
}
|
||
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
|
||
for gpuIdx, lines := range gpuLines {
|
||
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
|
||
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
|
||
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
|
||
}
|
||
return results
|
||
}
|
||
|
||
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
// simultaneously using a single bee-gpu-burn invocation per phase.
//
// Out-parameters: serverIdleW/serverIdleOK record the one-time IPMI idle
// reading; serverLoadedWSum/serverLoadedSamples/serverLoadedOK accumulate
// loaded readings; allMetricRows/metricTimelineSec collect the telemetry
// timeline; per-GPU results are appended to result.GPUs.
func runNvidiaBenchmarkParallel(
	ctx context.Context,
	verboseLog, runDir string,
	selected []int,
	infoByIndex map[int]benchmarkGPUInfo,
	opts NvidiaBenchmarkOptions,
	spec benchmarkProfileSpec,
	logFunc func(string),
	result *NvidiaBenchmarkResult,
	calibByIndex map[int]benchmarkPowerCalibrationResult,
	serverIdleW *float64, serverLoadedWSum *float64,
	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
	allMetricRows *[]GPUMetricRow,
	metricTimelineSec *float64,
	gpuBurnLog string,
) {
	allDevices := joinIndexList(selected)

	// Build per-GPU result stubs.
	// Each stub starts as FAILED and is upgraded at the end once scoring and
	// status classification have run.
	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
	for _, idx := range selected {
		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
		if info, ok := infoByIndex[idx]; ok {
			r.UUID = info.UUID
			r.Name = info.Name
			r.BusID = info.BusID
			r.VBIOS = info.VBIOS
			r.PowerLimitW = info.PowerLimitW
			r.MultiprocessorCount = info.MultiprocessorCount
			r.DefaultPowerLimitW = info.DefaultPowerLimitW
			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		// Carry calibration outcome (peak power/temp, derating, notes) into
		// the per-GPU result so reporting can reference it.
		if calib, ok := calibByIndex[idx]; ok {
			r.CalibratedPeakPowerW = calib.Summary.P95PowerW
			r.CalibratedPeakTempC = calib.Summary.P95TempC
			r.PowerCalibrationTries = calib.Attempts
			r.PowerLimitDerated = calib.Derated
			r.Notes = append(r.Notes, calib.Notes...)
			if calib.CoolingWarning != "" {
				r.CoolingWarning = calib.CoolingWarning
			}
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}
		gpuResults[idx] = r
	}

	// Baseline: sample all GPUs together.
	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
	if err != nil && err != context.Canceled {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
		}
	}
	for _, idx := range selected {
		perGPU := filterRowsByGPU(baselineRows, idx)
		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
	}
	appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))

	// Sample server idle power once.
	if !*serverIdleOK {
		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
			*serverIdleW = w
			*serverIdleOK = true
			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
		}
	}

	// Warmup: all GPUs simultaneously.
	warmupCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(spec.WarmupSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
	}
	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
	appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
	appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
	if warmupErr != nil {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
		}
	}
	// The warmup log also reveals each GPU's compute capability; narrow the
	// precision list to the smallest set any selected GPU supports so the
	// shared plan runs identically on all cards.
	warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
	supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
	for _, idx := range selected {
		if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
			if gpuResults[idx].ComputeCapability == "" {
				gpuResults[idx].ComputeCapability = pr.ComputeCapability
			}
			if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
				supportedPrecisions = ccPrecisions
			}
		}
	}

	// Run synthetic precision phases and the combined steady phase as one
	// uninterrupted command so the GPUs stay hot between windows.
	eccBase := make(map[int]BenchmarkECCCounters, len(selected))
	for _, idx := range selected {
		eccBase[idx], _ = queryECCCounters(idx)
	}
	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
		if label == "mixed" {
			return "steady"
		}
		return "gpu-all-steady-" + label
	})
	planCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(basePhaseSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
		"--precision-plan", strings.Join(planLabels, ","),
		"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
	}
	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
	_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
	for _, phaseSpec := range planPhases {
		if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
		}
		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
	}
	// Fold each per-precision phase into a BenchmarkPrecisionSteadyPhase per GPU.
	for _, prec := range supportedPrecisions {
		phaseLogName := "gpu-all-steady-" + prec
		phaseRows := phaseRowsByStage[phaseLogName]
		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
		for _, idx := range selected {
			perGPU := filterRowsByGPU(phaseRows, idx)
			phase := BenchmarkPrecisionSteadyPhase{
				Precision: prec,
				Status:    "OK",
				Steady:    summarizeBenchmarkTelemetry(perGPU),
			}
			if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
				phase.Status = status
				phase.Notes = note
				gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
			}
			if pr, ok := parseByGPU[idx]; ok {
				for _, p := range pr.Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
			}
			gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
		}
	}

	// Snapshot throttle counters before steady.
	// NOTE(review): the precision plan above already includes the "steady"
	// (mixed) window, so this snapshot appears to run AFTER the load it is
	// meant to bracket — confirm whether the throttle diff below still
	// captures the steady phase.
	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		beforeThrottle[idx], _ = queryThrottleCounters(idx)
	}

	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))

	// Sample server power via IPMI in parallel with steady phase.
	ipmiStopCh := make(chan struct{})
	ipmiResultCh := make(chan float64, 1)
	go func() {
		defer close(ipmiResultCh)
		var samples []float64
		ticker := time.NewTicker(5 * time.Second)
		defer ticker.Stop()
		// Initial 15 s grace period lets the load settle before sampling.
		select {
		case <-ipmiStopCh:
			return
		case <-time.After(15 * time.Second):
		}
		for {
			if w, err := queryIPMIServerPowerW(); err == nil {
				samples = append(samples, w)
			}
			select {
			case <-ipmiStopCh:
				if len(samples) > 0 {
					var sum float64
					for _, w := range samples {
						sum += w
					}
					ipmiResultCh <- sum / float64(len(samples))
				}
				return
			case <-ticker.C:
			}
		}
	}()

	// NOTE(review): the stop channel is closed immediately after the sampler
	// starts, with no load running in between (the steady window executed
	// earlier inside the precision plan). The sampler therefore exits during
	// its 15 s grace period without sending a result, so the loaded-power
	// branch below looks unreachable — presumably the sampler start/stop was
	// meant to bracket the planned command; verify and fix the sequencing.
	close(ipmiStopCh)
	if loadedW, ok := <-ipmiResultCh; ok {
		*serverLoadedWSum += loadedW
		(*serverLoadedSamples)++
		*serverLoadedOK = true
		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
	}
	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		afterThrottle[idx], _ = queryThrottleCounters(idx)
	}

	// The combined "mixed" window doubles as the classic steady phase.
	steadyRows := phaseRowsByStage["steady"]
	parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))

	for _, idx := range selected {
		perGPU := filterRowsByGPU(steadyRows, idx)
		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
		if eccFinal, err := queryECCCounters(idx); err == nil {
			gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
		}

		if pr, ok := parseResults[idx]; ok {
			gpuResults[idx].ComputeCapability = pr.ComputeCapability
			gpuResults[idx].Backend = pr.Backend
			gpuResults[idx].PrecisionResults = pr.Profiles
			if pr.Fallback {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
		}
		if planErr != nil {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
		}
	}

	// Cooldown: all GPUs together.
	if spec.CooldownSec > 0 {
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
		if err != nil && err != context.Canceled {
			for _, idx := range selected {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
			}
		}
		for _, idx := range selected {
			perGPU := filterRowsByGPU(cooldownRows, idx)
			gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
		}
		appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
	}

	// Score and finalize each GPU.
	for _, idx := range selected {
		r := gpuResults[idx]
		r.Scores = scoreBenchmarkGPUResult(*r)
		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
		pr := parseResults[idx]
		// Status priority: plan failure > precision failures > PTX fallback > OK.
		switch {
		case planErr != nil:
			r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
		case len(r.PrecisionFailures) > 0:
			r.Status = "PARTIAL"
		case pr.Fallback:
			r.Status = "PARTIAL"
		default:
			r.Status = "OK"
		}
		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
	}
}
|
||
|
||
// readBenchmarkHostConfig reads static CPU and memory configuration from
|
||
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
|
||
func readBenchmarkHostConfig() *BenchmarkHostConfig {
|
||
cfg := &BenchmarkHostConfig{}
|
||
populated := false
|
||
|
||
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
|
||
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
|
||
socketIDs := map[string]struct{}{}
|
||
coresPerSocket := map[string]int{}
|
||
var modelName string
|
||
threads := 0
|
||
for _, line := range strings.Split(string(data), "\n") {
|
||
kv := strings.SplitN(line, ":", 2)
|
||
if len(kv) != 2 {
|
||
continue
|
||
}
|
||
key := strings.TrimSpace(kv[0])
|
||
val := strings.TrimSpace(kv[1])
|
||
switch key {
|
||
case "processor":
|
||
threads++
|
||
case "model name":
|
||
if modelName == "" {
|
||
modelName = val
|
||
}
|
||
case "physical id":
|
||
socketIDs[val] = struct{}{}
|
||
case "cpu cores":
|
||
// Overwrite per-socket core count (last wins per socket, but all
|
||
// entries for the same socket report the same value).
|
||
if physLine := ""; physLine == "" {
|
||
// We accumulate below by treating cpu cores as a per-thread
|
||
// field; sum by socket requires a two-pass approach. Use the
|
||
// simpler approximation: totalCores = threads / (threads per core).
|
||
_ = val
|
||
}
|
||
}
|
||
}
|
||
// Second pass: per-socket core count.
|
||
var curSocket string
|
||
for _, line := range strings.Split(string(data), "\n") {
|
||
kv := strings.SplitN(line, ":", 2)
|
||
if len(kv) != 2 {
|
||
continue
|
||
}
|
||
key := strings.TrimSpace(kv[0])
|
||
val := strings.TrimSpace(kv[1])
|
||
switch key {
|
||
case "physical id":
|
||
curSocket = val
|
||
case "cpu cores":
|
||
if curSocket != "" {
|
||
if _, seen := coresPerSocket[curSocket]; !seen {
|
||
v, _ := strconv.Atoi(val)
|
||
coresPerSocket[curSocket] = v
|
||
}
|
||
}
|
||
}
|
||
}
|
||
totalCores := 0
|
||
for _, c := range coresPerSocket {
|
||
totalCores += c
|
||
}
|
||
cfg.CPUModel = modelName
|
||
cfg.CPUSockets = len(socketIDs)
|
||
if cfg.CPUSockets == 0 && threads > 0 {
|
||
cfg.CPUSockets = 1
|
||
}
|
||
cfg.CPUCores = totalCores
|
||
cfg.CPUThreads = threads
|
||
if modelName != "" || threads > 0 {
|
||
populated = true
|
||
}
|
||
}
|
||
|
||
// Parse /proc/meminfo for total physical RAM.
|
||
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
|
||
for _, line := range strings.Split(string(data), "\n") {
|
||
if strings.HasPrefix(line, "MemTotal:") {
|
||
fields := strings.Fields(line)
|
||
if len(fields) >= 2 {
|
||
kb, _ := strconv.ParseUint(fields[1], 10, 64)
|
||
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
|
||
populated = true
|
||
}
|
||
break
|
||
}
|
||
}
|
||
}
|
||
|
||
if !populated {
|
||
return nil
|
||
}
|
||
return cfg
|
||
}
|
||
|
||
// startCPULoadSampler starts a goroutine that samples host CPU load every
|
||
// intervalSec seconds until stopCh is closed, then sends the collected
|
||
// samples on the returned channel.
|
||
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
|
||
ch := make(chan []float64, 1)
|
||
go func() {
|
||
var samples []float64
|
||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||
defer ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-stopCh:
|
||
ch <- samples
|
||
return
|
||
case <-ticker.C:
|
||
if pct := sampleCPULoadPct(); pct > 0 {
|
||
samples = append(samples, pct)
|
||
}
|
||
}
|
||
}
|
||
}()
|
||
return ch
|
||
}
|
||
|
||
// summarizeCPULoad computes stats over sampled CPU load values and assigns
|
||
// a health status.
|
||
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||
if len(samples) == 0 {
|
||
return nil
|
||
}
|
||
sorted := append([]float64(nil), samples...)
|
||
sort.Float64s(sorted)
|
||
var sum float64
|
||
for _, v := range sorted {
|
||
sum += v
|
||
}
|
||
avg := sum / float64(len(sorted))
|
||
p95 := sorted[int(float64(len(sorted))*0.95)]
|
||
max := sorted[len(sorted)-1]
|
||
|
||
cl := &BenchmarkCPULoad{
|
||
AvgPct: math.Round(avg*10) / 10,
|
||
MaxPct: math.Round(max*10) / 10,
|
||
P95Pct: math.Round(p95*10) / 10,
|
||
Samples: len(sorted),
|
||
}
|
||
|
||
// Compute standard deviation to detect instability.
|
||
var variance float64
|
||
for _, v := range sorted {
|
||
d := v - avg
|
||
variance += d * d
|
||
}
|
||
stdDev := math.Sqrt(variance / float64(len(sorted)))
|
||
|
||
switch {
|
||
case avg > 20 || max > 40:
|
||
cl.Status = "high"
|
||
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
|
||
case stdDev > 12:
|
||
cl.Status = "unstable"
|
||
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
|
||
default:
|
||
cl.Status = "ok"
|
||
}
|
||
return cl
|
||
}
|
||
|
||
// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
|
||
// throttle counters. If a GPU starts throttling, the current targeted_power run
|
||
// is canceled immediately, the power limit is reduced, and a fresh full cycle
|
||
// is started again from the beginning. The selected reduced power limit stays
|
||
// active for the main benchmark and is restored by the caller afterwards.
|
||
func runBenchmarkPowerCalibration(
|
||
ctx context.Context,
|
||
verboseLog, runDir string,
|
||
gpuIndices []int,
|
||
infoByIndex map[int]benchmarkGPUInfo,
|
||
logFunc func(string),
|
||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
||
const calibDurationSec = 120
|
||
const maxDerateW = 150
|
||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||
const calibSearchTolerance = 10
|
||
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||
// doubling each retry until it would exceed the cap, at which point the
|
||
// next busy response fails the calibration immediately.
|
||
const dcgmResourceBusyMaxDelaySec = 300
|
||
|
||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||
return map[int]benchmarkPowerCalibrationResult{}, nil
|
||
}
|
||
|
||
canDerate := os.Geteuid() == 0
|
||
if !canDerate {
|
||
logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
|
||
}
|
||
|
||
type calibrationAttemptResult struct {
|
||
out []byte
|
||
rows []GPUMetricRow
|
||
err error
|
||
}
|
||
|
||
|
||
// gpuCalibState holds per-GPU binary search state during parallel calibration.
|
||
type gpuCalibState struct {
|
||
idx int
|
||
info benchmarkGPUInfo
|
||
originalLimitW int
|
||
appliedLimitW int
|
||
minLimitW int
|
||
lo int // highest verified-stable limit (assumed: minLimitW)
|
||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||
calib benchmarkPowerCalibrationResult
|
||
converged bool
|
||
}
|
||
|
||
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
||
var restore []benchmarkRestoreAction
|
||
|
||
// Initialise per-GPU state.
|
||
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
||
for _, idx := range gpuIndices {
|
||
info := infoByIndex[idx]
|
||
originalLimitW := int(math.Round(info.PowerLimitW))
|
||
if originalLimitW <= 0 {
|
||
originalLimitW = int(math.Round(info.DefaultPowerLimitW))
|
||
}
|
||
defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
|
||
if defaultLimitW <= 0 {
|
||
defaultLimitW = originalLimitW
|
||
}
|
||
appliedLimitW := originalLimitW
|
||
if appliedLimitW <= 0 {
|
||
appliedLimitW = defaultLimitW
|
||
}
|
||
minLimitW := appliedLimitW
|
||
switch {
|
||
case defaultLimitW > 0:
|
||
minLimitW = defaultLimitW - maxDerateW
|
||
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
|
||
if minLimitW < floorByRatio {
|
||
minLimitW = floorByRatio
|
||
}
|
||
case appliedLimitW > 0:
|
||
minLimitW = appliedLimitW - maxDerateW
|
||
}
|
||
if minLimitW < calibSearchTolerance {
|
||
minLimitW = calibSearchTolerance
|
||
}
|
||
s := &gpuCalibState{
|
||
idx: idx,
|
||
info: info,
|
||
originalLimitW: originalLimitW,
|
||
appliedLimitW: appliedLimitW,
|
||
minLimitW: minLimitW,
|
||
lo: minLimitW,
|
||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||
}
|
||
states = append(states, s)
|
||
if canDerate && originalLimitW > 0 {
|
||
idxCopy := idx
|
||
orig := originalLimitW
|
||
restore = append(restore, benchmarkRestoreAction{
|
||
name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
|
||
fn: func() {
|
||
_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
|
||
},
|
||
})
|
||
}
|
||
}
|
||
|
||
// Shared DCGM resource-busy back-off state (single diagnostic session).
|
||
busyRetries := 0
|
||
busyDelaySec := 1
|
||
sharedAttempt := 0
|
||
|
||
type sharedAttemptResult struct {
|
||
out []byte
|
||
rows []GPUMetricRow
|
||
err error
|
||
}
|
||
|
||
calibDone:
|
||
for {
|
||
// Collect non-converged GPUs.
|
||
var active []*gpuCalibState
|
||
for _, s := range states {
|
||
if !s.converged {
|
||
active = append(active, s)
|
||
}
|
||
}
|
||
if len(active) == 0 || ctx.Err() != nil {
|
||
break
|
||
}
|
||
|
||
sharedAttempt++
|
||
for _, s := range active {
|
||
s.calib.Attempts++
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
||
}
|
||
|
||
// Snapshot throttle counters for all active GPUs before the run.
|
||
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
|
||
for _, s := range active {
|
||
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
||
}
|
||
|
||
// Run targeted_power for ALL gpuIndices simultaneously so every card
|
||
// is under load during calibration — this reflects real server thermals.
|
||
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||
doneCh := make(chan sharedAttemptResult, 1)
|
||
go func() {
|
||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
|
||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||
}()
|
||
|
||
ticker := time.NewTicker(time.Second)
|
||
throttleReasons := make(map[int]string, len(active))
|
||
var ar sharedAttemptResult
|
||
|
||
attemptLoop:
|
||
for {
|
||
select {
|
||
case ar = <-doneCh:
|
||
break attemptLoop
|
||
case <-ticker.C:
|
||
// Poll throttle counters for each active GPU independently.
|
||
for _, s := range active {
|
||
if throttleReasons[s.idx] != "" {
|
||
continue // already detected for this GPU
|
||
}
|
||
after, err := queryThrottleCounters(s.idx)
|
||
if err != nil {
|
||
continue
|
||
}
|
||
// Record throttle but do NOT cancel — let dcgmi finish so
|
||
// nv-hostengine releases the slot cleanly before the next attempt.
|
||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||
throttleReasons[s.idx] = reason
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||
}
|
||
}
|
||
case <-ctx.Done():
|
||
cancelAttempt()
|
||
ar = <-doneCh
|
||
break attemptLoop
|
||
}
|
||
}
|
||
ticker.Stop()
|
||
cancelAttempt()
|
||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||
|
||
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
||
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
||
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
||
for _, s := range active {
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
||
s.converged = true
|
||
}
|
||
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
|
||
break calibDone
|
||
}
|
||
busyRetries++
|
||
// Undo attempt counter: busy retries don't count as real attempts.
|
||
for _, s := range active {
|
||
s.calib.Attempts--
|
||
}
|
||
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
|
||
select {
|
||
case <-ctx.Done():
|
||
break calibDone
|
||
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
||
}
|
||
next := busyDelaySec * 2
|
||
if next > dcgmResourceBusyMaxDelaySec {
|
||
next = dcgmResourceBusyMaxDelaySec + 1
|
||
}
|
||
busyDelaySec = next
|
||
sharedAttempt-- // retry same logical attempt number
|
||
continue
|
||
}
|
||
busyRetries = 0
|
||
busyDelaySec = 1
|
||
|
||
// Per-GPU analysis and binary search update.
|
||
for _, s := range active {
|
||
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
||
summary := summarizeBenchmarkTelemetry(perGPU)
|
||
throttle := throttleReasons[s.idx]
|
||
|
||
// Cooling warning: thermal throttle with fans not at maximum.
|
||
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
||
clocks := make([]float64, 0, len(perGPU))
|
||
var fanDutyValues []float64
|
||
fanDutyAvail := false
|
||
for _, r := range perGPU {
|
||
if r.ClockMHz > 0 {
|
||
clocks = append(clocks, r.ClockMHz)
|
||
}
|
||
if r.FanDutyCycleAvailable {
|
||
fanDutyAvail = true
|
||
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
|
||
}
|
||
}
|
||
dropPct := benchmarkClockDrift(clocks)
|
||
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
|
||
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
|
||
s.calib.CoolingWarning = fmt.Sprintf(
|
||
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
|
||
throttle, dropPct, p95FanDuty,
|
||
)
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
|
||
}
|
||
}
|
||
|
||
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
|
||
// Stable at current limit — update lo and binary-search upward.
|
||
s.calib.Summary = summary
|
||
s.calib.Completed = true
|
||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||
s.lo = s.appliedLimitW
|
||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||
next := roundTo5W((s.lo + s.hi) / 2)
|
||
if next > s.lo && next < s.hi {
|
||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
|
||
s.appliedLimitW = next
|
||
s.calib.AppliedPowerLimitW = float64(next)
|
||
s.calib.Completed = false // keep searching
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
|
||
continue // next GPU in active list
|
||
}
|
||
}
|
||
}
|
||
s.converged = true
|
||
continue
|
||
}
|
||
|
||
// Failed or throttled — log and binary-search downward.
|
||
switch {
|
||
case throttle != "":
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||
case ar.err != nil:
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
||
default:
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||
}
|
||
|
||
if !canDerate || s.appliedLimitW <= 0 {
|
||
s.converged = true
|
||
continue
|
||
}
|
||
s.hi = s.appliedLimitW
|
||
|
||
if s.hi-s.lo <= calibSearchTolerance {
|
||
if s.lo > s.minLimitW {
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||
s.appliedLimitW = s.lo
|
||
s.calib.AppliedPowerLimitW = float64(s.lo)
|
||
s.calib.Derated = s.lo < s.originalLimitW
|
||
}
|
||
} else {
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||
}
|
||
s.converged = true
|
||
continue
|
||
}
|
||
|
||
next := roundTo5W((s.lo + s.hi) / 2)
|
||
if next <= s.lo {
|
||
next = s.lo + calibSearchTolerance
|
||
}
|
||
if next >= s.hi {
|
||
next = (s.lo + s.hi) / 2
|
||
}
|
||
if next < s.minLimitW {
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||
s.converged = true
|
||
continue
|
||
}
|
||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
|
||
s.converged = true
|
||
continue
|
||
}
|
||
s.appliedLimitW = next
|
||
s.calib.AppliedPowerLimitW = float64(next)
|
||
s.calib.Derated = next < s.originalLimitW
|
||
s.info.PowerLimitW = float64(next)
|
||
infoByIndex[s.idx] = s.info
|
||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
||
}
|
||
}
|
||
|
||
for _, s := range states {
|
||
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
|
||
results[s.idx] = s.calib
|
||
}
|
||
}
|
||
return results, restore
|
||
}
|
||
|
||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
|
||
func isDCGMResourceBusy(err error) bool {
|
||
var exitErr *exec.ExitError
|
||
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
||
}
|
||
|
||
// roundTo5W snaps w to the closest multiple of 5 W; values exactly between
// two multiples (remainder 3 or more) round up.
func roundTo5W(w int) int {
	q, r := w/5, w%5
	if r >= 3 {
		q++
	}
	return q * 5
}
|
||
|
||
func powerBenchDurationSec(profile string) int {
|
||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||
case NvidiaBenchmarkProfileStability:
|
||
return 300
|
||
case NvidiaBenchmarkProfileOvernight:
|
||
return 600
|
||
default:
|
||
return 120
|
||
}
|
||
}
|
||
|
||
|
||
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
||
out := make(map[int]benchmarkGPUInfo, len(src))
|
||
for k, v := range src {
|
||
out[k] = v
|
||
}
|
||
return out
|
||
}
|
||
|
||
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||
var b strings.Builder
|
||
b.WriteString("# Bee Bench Power Report\n\n")
|
||
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||
fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
|
||
if len(result.Findings) > 0 {
|
||
b.WriteString("## Summary\n\n")
|
||
for _, finding := range result.Findings {
|
||
fmt.Fprintf(&b, "- %s\n", finding)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
if len(result.RecommendedSlotOrder) > 0 {
|
||
b.WriteString("## Recommended Slot Order\n\n")
|
||
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
||
}
|
||
if len(result.RampSteps) > 0 {
|
||
b.WriteString("## Ramp Sequence\n\n")
|
||
b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
|
||
b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
|
||
for _, step := range result.RampSteps {
|
||
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
|
||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
b.WriteString("## Per-Slot Results\n\n")
|
||
b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
|
||
b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
|
||
for _, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
|
||
gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
|
||
}
|
||
b.WriteString("\n")
|
||
for _, gpu := range result.GPUs {
|
||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||
|
||
for _, note := range gpu.Notes {
|
||
fmt.Fprintf(&b, "- %s\n", note)
|
||
}
|
||
b.WriteString("\n")
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
||
var b strings.Builder
|
||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||
if len(result.RecommendedSlotOrder) > 0 {
|
||
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
|
||
}
|
||
for _, step := range result.RampSteps {
|
||
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
|
||
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||
}
|
||
return b.String()
|
||
}
|
||
|
||
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||
if ctx == nil {
|
||
ctx = context.Background()
|
||
}
|
||
if logFunc == nil {
|
||
logFunc = func(string) {}
|
||
}
|
||
if strings.TrimSpace(baseDir) == "" {
|
||
baseDir = "/var/log/bee-bench/power"
|
||
}
|
||
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
|
||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
if len(selected) == 0 {
|
||
return "", fmt.Errorf("no NVIDIA GPUs selected")
|
||
}
|
||
ts := time.Now().UTC().Format("20060102-150405")
|
||
runDir := filepath.Join(baseDir, "power-"+ts)
|
||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||
}
|
||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||
if infoErr != nil {
|
||
return "", infoErr
|
||
}
|
||
hostname, _ := os.Hostname()
|
||
result := NvidiaPowerBenchResult{
|
||
BenchmarkVersion: benchmarkVersion,
|
||
GeneratedAt: time.Now().UTC(),
|
||
Hostname: hostname,
|
||
ServerModel: readServerModel(),
|
||
BenchmarkProfile: opts.Profile,
|
||
SelectedGPUIndices: append([]int(nil), selected...),
|
||
OverallStatus: "OK",
|
||
}
|
||
durationSec := powerBenchDurationSec(opts.Profile)
|
||
_ = durationSec
|
||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||
// establish a true single-card power baseline unaffected by neighbour heat.
|
||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||
var allRestoreActions []benchmarkRestoreAction
|
||
for _, idx := range selected {
|
||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||
_ = os.MkdirAll(singleDir, 0755)
|
||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc)
|
||
allRestoreActions = append(allRestoreActions, restore...)
|
||
if r, ok := c[idx]; ok {
|
||
calibByIndex[idx] = r
|
||
}
|
||
}
|
||
defer func() {
|
||
for i := len(allRestoreActions) - 1; i >= 0; i-- {
|
||
allRestoreActions[i].fn()
|
||
}
|
||
}()
|
||
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
|
||
for _, idx := range selected {
|
||
info := infoByIndex[idx]
|
||
calib := calibByIndex[idx]
|
||
status := "OK"
|
||
if !calib.Completed {
|
||
status = "FAILED"
|
||
result.OverallStatus = "PARTIAL"
|
||
} else if calib.Derated {
|
||
status = "PARTIAL"
|
||
if result.OverallStatus == "OK" {
|
||
result.OverallStatus = "PARTIAL"
|
||
}
|
||
}
|
||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
||
Index: idx,
|
||
Name: info.Name,
|
||
BusID: info.BusID,
|
||
DefaultPowerLimitW: info.DefaultPowerLimitW,
|
||
AppliedPowerLimitW: calib.AppliedPowerLimitW,
|
||
MaxObservedPowerW: calib.Summary.P95PowerW,
|
||
MaxObservedTempC: calib.Summary.P95TempC,
|
||
CalibrationAttempts: calib.Attempts,
|
||
Derated: calib.Derated,
|
||
Status: status,
|
||
Notes: append([]string(nil), calib.Notes...),
|
||
CoolingWarning: calib.CoolingWarning,
|
||
})
|
||
}
|
||
sort.Slice(gpus, func(i, j int) bool {
|
||
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
||
return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
|
||
}
|
||
if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
|
||
return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
|
||
}
|
||
if gpus[i].Derated != gpus[j].Derated {
|
||
return !gpus[i].Derated
|
||
}
|
||
return gpus[i].Index < gpus[j].Index
|
||
})
|
||
result.GPUs = gpus
|
||
result.RecommendedSlotOrder = make([]int, 0, len(gpus))
|
||
for _, gpu := range gpus {
|
||
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||
}
|
||
if len(result.RecommendedSlotOrder) > 0 {
|
||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
||
}
|
||
for _, gpu := range gpus {
|
||
if gpu.Derated {
|
||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
||
}
|
||
if gpu.CoolingWarning != "" {
|
||
result.Findings = append(result.Findings, fmt.Sprintf(
|
||
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
||
gpu.Index, gpu.CoolingWarning,
|
||
))
|
||
}
|
||
}
|
||
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
|
||
for _, gpu := range gpus {
|
||
singleByIndex[gpu.Index] = gpu
|
||
}
|
||
|
||
// Phase 2: ramp — add one GPU per step and calibrate the growing subset
|
||
// simultaneously. Step 1 reuses single-card results; steps 2..N run fresh
|
||
// targeted_power with derating if degradation is detected.
|
||
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
||
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
||
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
||
_ = os.MkdirAll(stepDir, 0755)
|
||
var stepCalib map[int]benchmarkPowerCalibrationResult
|
||
if step == 1 {
|
||
// Single-GPU step — already measured in phase 1; reuse directly.
|
||
stepCalib = calibByIndex
|
||
logFunc(fmt.Sprintf("power ramp: step 1/%d — reusing single-card calibration for GPU %d", len(result.RecommendedSlotOrder), subset[0]))
|
||
} else {
|
||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||
var stepRestore []benchmarkRestoreAction
|
||
stepCalib, stepRestore = runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
||
for i := len(stepRestore) - 1; i >= 0; i-- {
|
||
stepRestore[i].fn()
|
||
}
|
||
}
|
||
ramp := NvidiaPowerBenchStep{
|
||
StepIndex: step,
|
||
GPUIndices: subset,
|
||
Status: "OK",
|
||
}
|
||
var realizationValues []float64
|
||
for _, idx := range subset {
|
||
calib := stepCalib[idx]
|
||
ramp.TotalObservedPowerW += calib.Summary.P95PowerW
|
||
if calib.Derated {
|
||
ramp.DeratedGPUCount++
|
||
ramp.Status = "PARTIAL"
|
||
}
|
||
if !calib.Completed {
|
||
ramp.Status = "FAILED"
|
||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
|
||
continue
|
||
}
|
||
if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
|
||
realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
|
||
realizationValues = append(realizationValues, realization)
|
||
}
|
||
}
|
||
if len(subset) > 0 {
|
||
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
|
||
}
|
||
if len(realizationValues) > 0 {
|
||
ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
|
||
ramp.MinPowerRealizationPct = realizationValues[0]
|
||
for _, v := range realizationValues[1:] {
|
||
if v < ramp.MinPowerRealizationPct {
|
||
ramp.MinPowerRealizationPct = v
|
||
}
|
||
}
|
||
}
|
||
if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
|
||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
|
||
if result.OverallStatus == "OK" {
|
||
result.OverallStatus = "PARTIAL"
|
||
}
|
||
}
|
||
if ramp.DeratedGPUCount > 0 {
|
||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
|
||
}
|
||
result.RampSteps = append(result.RampSteps, ramp)
|
||
}
|
||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||
if err != nil {
|
||
return "", fmt.Errorf("marshal power result: %w", err)
|
||
}
|
||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||
return "", fmt.Errorf("write result.json: %w", err)
|
||
}
|
||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
|
||
return "", fmt.Errorf("write report.md: %w", err)
|
||
}
|
||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
|
||
return "", fmt.Errorf("write summary.txt: %w", err)
|
||
}
|
||
return runDir, nil
|
||
}
|