@@ -16,7 +16,7 @@ import (
 	"time"
 )
 
-const benchmarkVersion = "1"
+const benchmarkVersion = "2"
 
 type benchmarkProfileSpec struct {
 	Name string
@@ -41,6 +41,15 @@ type benchmarkGPUInfo struct {
 	MultiprocessorCount int
 }
 
+type benchmarkPowerCalibrationResult struct {
+	Summary            BenchmarkTelemetrySummary
+	AppliedPowerLimitW float64
+	Attempts           int
+	Derated            bool
+	Completed          bool
+	Notes              []string
+}
+
 type benchmarkBurnProfile struct {
 	name     string
 	category string
@@ -78,7 +87,36 @@ var (
 // to highest power draw so thermal ramp-up is gradual.
 var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
 
-func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
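+// computeCapabilityCode parses a compute-capability string such as "9.0" into
+// a single comparable code (major*10 + minor); blank or malformed input yields 0.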
+func computeCapabilityCode(raw string) int {
+	raw = strings.TrimSpace(raw)
+	if raw == "" {
+		return 0
+	}
+	parts := strings.SplitN(raw, ".", 2)
+	major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
+	minor := 0
+	if len(parts) > 1 {
+		minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
+	}
+	return major*10 + minor
+}
+
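+// benchmarkSupportedPrecisions filters the global precision list by compute
+// capability: fp4 is kept only when the reported capability code is 100 (10.0)
+// or higher, or when the capability is unknown (code 0).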
+func benchmarkSupportedPrecisions(computeCapability string) []string {
+	cc := computeCapabilityCode(computeCapability)
+	out := make([]string, 0, len(benchmarkPrecisionPhases))
+	for _, prec := range benchmarkPrecisionPhases {
+		if prec == "fp4" && cc > 0 && cc < 100 {
+			continue
+		}
+		out = append(out, prec)
+	}
+	return out
+}
+
+func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
+	if len(precisions) == 0 {
+		precisions = append([]string(nil), benchmarkPrecisionPhases...)
+	}
 	switch spec.Name {
 	case NvidiaBenchmarkProfileStandard:
 		basePhaseSec = 60
@@ -90,7 +128,7 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
 		basePhaseSec = 3600
 		mixedPhaseSec = 14400
 	default:
-		totalWeight := len(benchmarkPrecisionPhases) + 5
+		totalWeight := len(precisions) + 5
 		if totalWeight <= 0 {
 			return nil, nil, 0, 0
 		}
@@ -100,9 +138,9 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
 		}
 		mixedPhaseSec = basePhaseSec * 5
 	}
-	planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1)
-	planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1)
-	for _, prec := range benchmarkPrecisionPhases {
+	planLabels = make([]string, 0, len(precisions)+1)
+	planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
+	for _, prec := range precisions {
 		planLabels = append(planLabels, prec)
 		planPhases = append(planPhases, benchmarkPlannedPhase{
 			PlanLabel: prec,
@@ -127,6 +165,53 @@ func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
 	return strings.Join(values, ",")
 }
 
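+// benchmarkPlannedPhaseStatus classifies the raw log of one planned precision
+// phase: empty output and failure markers map to FAILED, "unsupported" markers
+// map to UNSUPPORTED, and anything else is OK.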
+func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
+	text := strings.ToLower(strings.TrimSpace(string(raw)))
+	switch {
+	case text == "":
+		return "FAILED", "phase produced no output"
+	case strings.Contains(text, "phase_error="):
+		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "cublaslt_profiles=unsupported") {
+			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
+		}
+		return "FAILED", "precision phase failed"
+	case strings.Contains(text, "status=failed"):
+		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") {
+			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
+		}
+		return "FAILED", "precision phase failed"
+	default:
+		return "OK", ""
+	}
+}
+
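+// benchmarkCalibrationThrottleReason reports which throttle counter moved
+// between two snapshots, in priority order; an empty string means no
+// throttling was observed.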
+func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
+	diff := diffThrottleCounters(before, after)
+	switch {
+	case diff.HWThermalSlowdownUS > 0:
+		return "hw_thermal"
+	case diff.SWThermalSlowdownUS > 0:
+		return "sw_thermal"
+	case diff.HWPowerBrakeSlowdownUS > 0:
+		return "hw_power_brake"
+	default:
+		return ""
+	}
+}
+
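+// setBenchmarkPowerLimit applies a software power cap via `nvidia-smi -pl`.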
+func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
+	if powerLimitW <= 0 {
+		return fmt.Errorf("invalid power limit %d", powerLimitW)
+	}
+	out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{
+		"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW),
+	}, nil, nil)
+	if err != nil {
+		return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
+	}
+	return nil
+}
+
 func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if ctx == nil {
 		ctx = context.Background()
@@ -135,7 +220,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		logFunc = func(string) {}
 	}
 	if strings.TrimSpace(baseDir) == "" {
-		baseDir = "/var/log/bee-benchmark"
+		baseDir = "/var/log/bee-bench/perf"
 	}
 	spec := resolveBenchmarkProfile(opts.Profile)
 	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
@@ -149,7 +234,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 	}
 
 	ts := time.Now().UTC().Format("20060102-150405")
-	runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts)
+	runDir := filepath.Join(baseDir, "perf-"+ts)
 	if err := os.MkdirAll(runDir, 0755); err != nil {
 		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
 	}
@@ -175,6 +260,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
 	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
 	var metricRows []GPUMetricRow
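+	// metricTimelineSec is a cumulative cursor of planned stage durations so
+	// every stage's samples land on one shared timeline.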
+	metricTimelineSec := 0.0
 	gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
 
 	// Server power characterization state — populated during per-GPU phases.
@@ -215,14 +301,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
 	// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
 	// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
-	calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc)
+	calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
+	restoreActions = append(restoreActions, powerRestoreActions...)
+	for _, idx := range selected {
+		if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
+			result.Warnings = append(result.Warnings, fmt.Sprintf(
+				"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
+				idx, calib.AppliedPowerLimitW,
+			))
+		}
+	}
 
 	// Start background CPU load sampler — samples every 10s during GPU phases.
 	cpuStopCh := make(chan struct{})
 	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
 
 	if opts.ParallelGPUs {
-		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog)
+		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
 	} else {
 
 		for _, idx := range selected {
@@ -242,8 +337,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 				gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 			}
-			if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
-				gpuResult.CalibratedPeakPowerW = w
+			if calib, ok := calibByIndex[idx]; ok {
+				gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
+				gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
+				gpuResult.PowerCalibrationTries = calib.Attempts
+				gpuResult.PowerLimitDerated = calib.Derated
+				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
 			}
 			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -255,7 +354,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
 			}
 			gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
-			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx))
+			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
 
 			// Sample server idle power once (first GPU only — server state is global).
 			if !serverIdleOK {
@@ -274,18 +373,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			}
 			logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
 			warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
-			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx))
+			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
 			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
 			if warmupErr != nil {
 				gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
 				result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
 				continue
 			}
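+			// Capture the compute capability from the warmup log so the
+			// per-GPU precision plan can be filtered below.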
+			warmupParse := parseBenchmarkBurnLog(string(warmupOut))
+			if gpuResult.ComputeCapability == "" {
+				gpuResult.ComputeCapability = warmupParse.ComputeCapability
+			}
 
 			// Run synthetic precision phases and the combined steady phase as one
 			// uninterrupted command so the GPU stays hot between windows.
 			eccBase, _ := queryECCCounters(idx)
-			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
+			supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
+			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
 				if label == "mixed" {
 					return fmt.Sprintf("gpu-%d-steady", idx)
 				}
@@ -299,24 +403,27 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 				"--precision-plan", strings.Join(planLabels, ","),
 				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
 			}
-			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
+			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
 			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
 			for _, phaseSpec := range planPhases {
 				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
-					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage)
+					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
 				}
 				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
 			}
-			for _, prec := range benchmarkPrecisionPhases {
+			for _, prec := range supportedPrecisions {
 				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
 				phaseRows := phaseRowsByStage[stageName]
 				if len(phaseRows) == 0 {
 					continue
 				}
 				phase := BenchmarkPrecisionSteadyPhase{
 					Precision: prec,
 					Status:    "OK",
 					Steady:    summarizeBenchmarkTelemetry(phaseRows),
 				}
+				if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
+					phase.Status = status
+					phase.Notes = note
+					gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
+				}
 				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
 					if p.Supported {
 						phase.TeraOpsPerSec += p.TeraOpsPerSec
@@ -396,13 +503,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
 				}
 				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
-				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
+				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
 			}
 
 			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
 			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
 			if planErr != nil {
 				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
+			} else if len(gpuResult.PrecisionFailures) > 0 {
+				gpuResult.Status = "PARTIAL"
 			} else if parseResult.Fallback {
 				gpuResult.Status = "PARTIAL"
 			} else {
@@ -929,35 +1038,34 @@ func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
 	}
 }
 
-func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow {
+func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
 	if len(rows) == 0 {
 		return nil
 	}
+	stageEnd := offset + durationSec
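+	// A non-positive duration means no planned length was given; fall back to
+	// the span actually covered by the samples.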
+	if stageEnd <= offset {
+		stageEnd = offset
+		for _, row := range rows {
+			if row.ElapsedSec+offset > stageEnd {
+				stageEnd = row.ElapsedSec + offset
+			}
+		}
+	}
 	out := make([]GPUMetricRow, len(rows))
 	for i, row := range rows {
 		row.Stage = stage
 		row.ElapsedSec += offset
+		row.StageStartSec = offset
+		row.StageEndSec = stageEnd
 		out[i] = row
 	}
 	return out
 }
 
-func benchmarkMetricOffset(rows []GPUMetricRow) float64 {
-	if len(rows) == 0 {
-		return 0
-	}
-	var maxElapsed float64
-	for _, row := range rows {
-		if row.ElapsedSec > maxElapsed {
-			maxElapsed = row.ElapsedSec
-		}
-	}
-	return maxElapsed
-}
-
-func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
-	annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows))
+func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
+	annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
 	*allRows = append(*allRows, annotated...)
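+	// Advance the shared timeline cursor so the next stage starts where this one ends.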
+	*cursor += durationSec
 }
 
 func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
@@ -1308,6 +1416,9 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat
 	if normalizationStatus != "full" {
 		reasons = append(reasons, "normalization_partial")
 	}
+	if gpu.PowerLimitDerated {
+		reasons = append(reasons, "power_limit_derated")
+	}
 	if gpu.ECC.Uncorrected > 0 {
 		reasons = append(reasons, "ecc_uncorrected_errors")
 	}
@@ -1522,12 +1633,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
 			case "normalization_partial":
 				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
+			case "power_limit_derated":
+				findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
 			case "ecc_uncorrected_errors":
 				findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
 			case "ecc_corrected_errors":
 				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
 			}
 		}
+		if len(gpu.PrecisionFailures) > 0 {
+			findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
+		}
 		if gpu.Backend == "driver-ptx" {
 			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
 		}
@@ -1896,10 +2012,11 @@ func runNvidiaBenchmarkParallel(
 	spec benchmarkProfileSpec,
 	logFunc func(string),
 	result *NvidiaBenchmarkResult,
-	calibPowerByIndex map[int]float64,
+	calibByIndex map[int]benchmarkPowerCalibrationResult,
 	serverIdleW *float64, serverLoadedWSum *float64,
 	serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
 	allMetricRows *[]GPUMetricRow,
+	metricTimelineSec *float64,
 	gpuBurnLog string,
 ) {
 	allDevices := joinIndexList(selected)
@@ -1920,8 +2037,12 @@ func runNvidiaBenchmarkParallel(
 			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
 			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
 		}
-		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
-			r.CalibratedPeakPowerW = w
+		if calib, ok := calibByIndex[idx]; ok {
+			r.CalibratedPeakPowerW = calib.Summary.P95PowerW
+			r.CalibratedPeakTempC = calib.Summary.P95TempC
+			r.PowerCalibrationTries = calib.Attempts
+			r.PowerLimitDerated = calib.Derated
+			r.Notes = append(r.Notes, calib.Notes...)
 		}
 		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
 			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1941,7 +2062,7 @@ func runNvidiaBenchmarkParallel(
 		perGPU := filterRowsByGPU(baselineRows, idx)
 		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
 	}
-	appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline")
+	appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))
 
 	// Sample server idle power once.
 	if !*serverIdleOK {
@@ -1961,13 +2082,25 @@ func runNvidiaBenchmarkParallel(
 	}
 	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
 	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
-	appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup")
+	appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
 	appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
 	if warmupErr != nil {
 		for _, idx := range selected {
 			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
 		}
 	}
+	warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
+	supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
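+	// All GPUs share one plan in parallel mode, so keep the most restrictive
+	// supported-precision set across the selection.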
+	for _, idx := range selected {
+		if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
+			if gpuResults[idx].ComputeCapability == "" {
+				gpuResults[idx].ComputeCapability = pr.ComputeCapability
+			}
+			if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
+				supportedPrecisions = ccPrecisions
+			}
+		}
+	}
 
 	// Run synthetic precision phases and the combined steady phase as one
 	// uninterrupted command so the GPUs stay hot between windows.
@@ -1975,7 +2108,7 @@ func runNvidiaBenchmarkParallel(
 	for _, idx := range selected {
 		eccBase[idx], _ = queryECCCounters(idx)
 	}
-	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
+	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
 		if label == "mixed" {
 			return "steady"
 		}
@@ -1989,30 +2122,30 @@ func runNvidiaBenchmarkParallel(
 		"--precision-plan", strings.Join(planLabels, ","),
 		"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
 	}
-	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
+	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
 	_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
 	for _, phaseSpec := range planPhases {
 		if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
-			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
+			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
 		}
 		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
 	}
-	for _, prec := range benchmarkPrecisionPhases {
+	for _, prec := range supportedPrecisions {
 		phaseLogName := "gpu-all-steady-" + prec
 		phaseRows := phaseRowsByStage[phaseLogName]
 		if len(phaseRows) == 0 {
 			continue
 		}
 		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
 		for _, idx := range selected {
 			perGPU := filterRowsByGPU(phaseRows, idx)
 			if len(perGPU) == 0 {
 				continue
 			}
 			phase := BenchmarkPrecisionSteadyPhase{
 				Precision: prec,
 				Status:    "OK",
 				Steady:    summarizeBenchmarkTelemetry(perGPU),
 			}
+			if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
+				phase.Status = status
+				phase.Notes = note
+				gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
+			}
 			if pr, ok := parseByGPU[idx]; ok {
 				for _, p := range pr.Profiles {
 					if p.Supported {
@@ -2113,7 +2246,7 @@ func runNvidiaBenchmarkParallel(
 		perGPU := filterRowsByGPU(cooldownRows, idx)
 		gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
 	}
-	appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
+	appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
 }
 
 // Score and finalize each GPU.
@@ -2125,6 +2258,8 @@ func runNvidiaBenchmarkParallel(
 		switch {
 		case planErr != nil:
 			r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
+		case len(r.PrecisionFailures) > 0:
+			r.Status = "PARTIAL"
 		case pr.Fallback:
 			r.Status = "PARTIAL"
 		default:
@@ -2299,59 +2434,172 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
 	return cl
 }
 
-// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
-// collecting nvidia-smi power samples in parallel. It returns a map from GPU
-// index to p95 observed power (watts), which is used as the reference for
-// PowerSustainScore instead of the hardware default limit.
-//
-// If dcgmi is unavailable or the run fails the function returns an empty map
-// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
-// gracefully — it must never block or fail the main benchmark.
+// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
+// throttle counters. If a GPU starts throttling, the current targeted_power run
+// is canceled immediately, the power limit is reduced, and a fresh full cycle
+// is started again from the beginning. The selected reduced power limit stays
+// active for the main benchmark and is restored by the caller afterwards.
 func runBenchmarkPowerCalibration(
 	ctx context.Context,
 	verboseLog, runDir string,
 	gpuIndices []int,
+	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
-) map[int]float64 {
+) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
 	const calibDurationSec = 120
+	const derateStepW = 25
+	const maxDerateW = 150
 
 	// dcgmi must be present.
 	if _, err := exec.LookPath("dcgmi"); err != nil {
 		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-		return map[int]float64{}
+		return map[int]benchmarkPowerCalibrationResult{}, nil
 	}
 
-	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
-
-	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
-	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
-	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
-	if err != nil {
-		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
-		return map[int]float64{}
-	}
+	canDerate := os.Geteuid() == 0
+	if !canDerate {
+		logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
+	}
 
-	// Group rows by GPU index and compute p95 power for each.
-	result := make(map[int]float64, len(gpuIndices))
+	type calibrationAttemptResult struct {
+		out  []byte
+		rows []GPUMetricRow
+		err  error
+	}
 
+	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
+	var restore []benchmarkRestoreAction
 	for _, idx := range gpuIndices {
-		perGPU := filterRowsByGPU(rows, idx)
-		if len(perGPU) == 0 {
-			continue
-		}
-		powers := make([]float64, 0, len(perGPU))
-		for _, r := range perGPU {
-			if r.PowerW > 0 {
-				powers = append(powers, r.PowerW)
-			}
-		}
+		info := infoByIndex[idx]
+		originalLimitW := int(math.Round(info.PowerLimitW))
+		if originalLimitW <= 0 {
+			originalLimitW = int(math.Round(info.DefaultPowerLimitW))
+		}
+		defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
+		if defaultLimitW <= 0 {
+			defaultLimitW = originalLimitW
+		}
+		appliedLimitW := originalLimitW
+		if appliedLimitW <= 0 {
+			appliedLimitW = defaultLimitW
+		}
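+		// Derating floor: go no more than maxDerateW below the default limit,
+		// and never below 70% of it (nor below a single derate step).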
+		minLimitW := appliedLimitW
+		switch {
+		case defaultLimitW > 0:
+			minLimitW = defaultLimitW - maxDerateW
+			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
+			if minLimitW < floorByRatio {
+				minLimitW = floorByRatio
+			}
+		case appliedLimitW > 0:
+			minLimitW = appliedLimitW - maxDerateW
+		}
-		if len(powers) == 0 {
-			continue
-		}
+		if minLimitW < derateStepW {
+			minLimitW = derateStepW
+		}
-		p95 := benchmarkPercentile(powers, 95)
-		if p95 > 0 {
-			result[idx] = p95
-			logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
-		}
 
+		calib := benchmarkPowerCalibrationResult{
+			AppliedPowerLimitW: float64(appliedLimitW),
+		}
+		if canDerate && originalLimitW > 0 {
+			idxCopy := idx
+			orig := originalLimitW
+			restore = append(restore, benchmarkRestoreAction{
+				name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
+				fn: func() {
+					_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
+				},
+			})
+		}
+
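+		// Retry loop: run targeted_power at the current limit; any observed
+		// throttling cancels the attempt, the limit drops by derateStepW, and
+		// the cycle restarts until it completes cleanly or hits minLimitW.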
+		for {
+			calib.Attempts++
+			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
+
+			beforeThrottle, _ := queryThrottleCounters(idx)
+			attemptCtx, cancel := context.WithCancel(ctx)
+			doneCh := make(chan calibrationAttemptResult, 1)
+			logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
+			cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
+			go func() {
+				out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
+				doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
+			}()
+
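+			// Poll throttle counters once per second while targeted_power
+			// runs in the goroutine above.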
+			ticker := time.NewTicker(time.Second)
+			var (
+				attempt        calibrationAttemptResult
+				throttleReason string
+			)
+		attemptLoop:
+			for {
+				select {
+				case attempt = <-doneCh:
+					break attemptLoop
+				case <-ticker.C:
+					afterThrottle, err := queryThrottleCounters(idx)
+					if err != nil {
+						continue
+					}
+					if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
+						throttleReason = reason
+						cancel()
+					}
+				case <-ctx.Done():
+					cancel()
+					attempt = <-doneCh
+					break attemptLoop
+				}
+			}
+			ticker.Stop()
+			cancel()
+			_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
+
+			perGPU := filterRowsByGPU(attempt.rows, idx)
+			summary := summarizeBenchmarkTelemetry(perGPU)
+			if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
+				calib.Summary = summary
+				calib.Completed = true
+				calib.AppliedPowerLimitW = float64(appliedLimitW)
+				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
+				break
+			}
+
+			switch {
+			case throttleReason != "":
+				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
+				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
+			case attempt.err != nil:
+				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
+				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
+			default:
+				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
+				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
+			}
+
+			if !canDerate || appliedLimitW <= 0 {
+				break
+			}
+			nextLimitW := appliedLimitW - derateStepW
+			if nextLimitW < minLimitW {
+				calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
+				break
+			}
+			if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
+				calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
+				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
+				break
+			}
+			appliedLimitW = nextLimitW
+			calib.AppliedPowerLimitW = float64(appliedLimitW)
+			calib.Derated = true
+			info.PowerLimitW = float64(appliedLimitW)
+			infoByIndex[idx] = info
+			calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
+		}
+
+		if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
+			results[idx] = calib
+		}
 	}
-	return result
+	return results, restore
 }