|
|
|
|
@@ -49,6 +49,10 @@ type benchmarkPowerCalibrationResult struct {
|
|
|
|
|
Derated bool
|
|
|
|
|
Completed bool
|
|
|
|
|
Notes []string
|
|
|
|
|
// CoolingWarning is set when the GPU throttled thermally with a clock drop
|
|
|
|
|
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
|
|
|
|
// cooling system may not be correctly configured for full GPU load.
|
|
|
|
|
CoolingWarning string
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type benchmarkBurnProfile struct {
|
|
|
|
|
@@ -344,6 +348,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|
|
|
|
gpuResult.PowerCalibrationTries = calib.Attempts
|
|
|
|
|
gpuResult.PowerLimitDerated = calib.Derated
|
|
|
|
|
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
|
|
|
|
|
if calib.CoolingWarning != "" {
|
|
|
|
|
gpuResult.CoolingWarning = calib.CoolingWarning
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
|
|
|
|
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
|
|
|
|
@@ -1625,7 +1632,15 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|
|
|
|
case "power_capped":
|
|
|
|
|
findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
|
|
|
|
|
case "thermal_limited":
|
|
|
|
|
findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
|
|
|
|
|
msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index)
|
|
|
|
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable &&
|
|
|
|
|
result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 {
|
|
|
|
|
msg += fmt.Sprintf(
|
|
|
|
|
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.",
|
|
|
|
|
result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct,
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
findings = append(findings, msg)
|
|
|
|
|
case "sync_boost_limited":
|
|
|
|
|
findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
|
|
|
|
|
case "low_sm_clock_vs_target":
|
|
|
|
|
@@ -1642,6 +1657,12 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|
|
|
|
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if gpu.CoolingWarning != "" {
|
|
|
|
|
findings = append(findings, fmt.Sprintf(
|
|
|
|
|
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
|
|
|
|
gpu.Index, gpu.CoolingWarning,
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
if len(gpu.PrecisionFailures) > 0 {
|
|
|
|
|
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
|
|
|
|
|
}
|
|
|
|
|
@@ -2044,6 +2065,9 @@ func runNvidiaBenchmarkParallel(
|
|
|
|
|
r.PowerCalibrationTries = calib.Attempts
|
|
|
|
|
r.PowerLimitDerated = calib.Derated
|
|
|
|
|
r.Notes = append(r.Notes, calib.Notes...)
|
|
|
|
|
if calib.CoolingWarning != "" {
|
|
|
|
|
r.CoolingWarning = calib.CoolingWarning
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
|
|
|
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
|
|
|
|
@@ -2448,8 +2472,10 @@ func runBenchmarkPowerCalibration(
|
|
|
|
|
logFunc func(string),
|
|
|
|
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
|
|
|
|
const calibDurationSec = 120
|
|
|
|
|
const derateStepW = 25
|
|
|
|
|
const maxDerateW = 150
|
|
|
|
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
|
|
|
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
|
|
|
|
const calibSearchTolerance = 10
|
|
|
|
|
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
|
|
|
|
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
|
|
|
|
// doubling each retry until it would exceed the cap, at which point the
|
|
|
|
|
@@ -2472,8 +2498,25 @@ func runBenchmarkPowerCalibration(
|
|
|
|
|
err error
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// gpuCalibState holds per-GPU binary search state during parallel calibration.
//
// The search brackets the highest stable power limit between lo (verified
// stable) and hi (verified unstable); calib accumulates the result that is
// ultimately reported for this GPU.
type gpuCalibState struct {
	idx int // GPU index this state belongs to
	info benchmarkGPUInfo // GPU info snapshot; PowerLimitW is updated as limits change
	originalLimitW int // power limit (W) before calibration started
	appliedLimitW int // power limit (W) currently applied to the card
	minLimitW int // floor (W) below which the search gives up
	lo int // highest verified-stable limit (assumed: minLimitW)
	hi int // lowest verified-unstable limit (exclusive sentinel above start)
	calib benchmarkPowerCalibrationResult // accumulated per-GPU calibration outcome
	converged bool // true once the binary search has finished for this GPU
}
|
|
|
|
|
|
|
|
|
|
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
|
|
|
|
var restore []benchmarkRestoreAction
|
|
|
|
|
|
|
|
|
|
// Initialise per-GPU state.
|
|
|
|
|
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
|
|
|
|
for _, idx := range gpuIndices {
|
|
|
|
|
info := infoByIndex[idx]
|
|
|
|
|
originalLimitW := int(math.Round(info.PowerLimitW))
|
|
|
|
|
@@ -2499,15 +2542,20 @@ func runBenchmarkPowerCalibration(
|
|
|
|
|
case appliedLimitW > 0:
|
|
|
|
|
minLimitW = appliedLimitW - maxDerateW
|
|
|
|
|
}
|
|
|
|
|
if minLimitW < derateStepW {
|
|
|
|
|
minLimitW = derateStepW
|
|
|
|
|
if minLimitW < calibSearchTolerance {
|
|
|
|
|
minLimitW = calibSearchTolerance
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
calib := benchmarkPowerCalibrationResult{
|
|
|
|
|
AppliedPowerLimitW: float64(appliedLimitW),
|
|
|
|
|
s := &gpuCalibState{
|
|
|
|
|
idx: idx,
|
|
|
|
|
info: info,
|
|
|
|
|
originalLimitW: originalLimitW,
|
|
|
|
|
appliedLimitW: appliedLimitW,
|
|
|
|
|
minLimitW: minLimitW,
|
|
|
|
|
lo: minLimitW,
|
|
|
|
|
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
|
|
|
|
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
|
|
|
|
}
|
|
|
|
|
busyRetries := 0
|
|
|
|
|
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
|
|
|
|
|
states = append(states, s)
|
|
|
|
|
if canDerate && originalLimitW > 0 {
|
|
|
|
|
idxCopy := idx
|
|
|
|
|
orig := originalLimitW
|
|
|
|
|
@@ -2518,125 +2566,243 @@ func runBenchmarkPowerCalibration(
|
|
|
|
|
},
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
calibLoop:
|
|
|
|
|
// Shared DCGM resource-busy back-off state (single diagnostic session).
|
|
|
|
|
busyRetries := 0
|
|
|
|
|
busyDelaySec := 1
|
|
|
|
|
sharedAttempt := 0
|
|
|
|
|
|
|
|
|
|
// sharedAttemptResult carries the outcome of one shared targeted_power run
// that covers all GPUs at once: raw diag output, collected telemetry rows,
// and any error from the run (nil on success).
type sharedAttemptResult struct {
	out []byte // raw combined output of the dcgmi diag invocation
	rows []GPUMetricRow // telemetry samples gathered while the run was active
	err error // non-nil when the run failed or was canceled
}
|
|
|
|
|
|
|
|
|
|
calibDone:
|
|
|
|
|
for {
|
|
|
|
|
// Collect non-converged GPUs.
|
|
|
|
|
var active []*gpuCalibState
|
|
|
|
|
for _, s := range states {
|
|
|
|
|
if !s.converged {
|
|
|
|
|
active = append(active, s)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if len(active) == 0 || ctx.Err() != nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sharedAttempt++
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
s.calib.Attempts++
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Snapshot throttle counters for all active GPUs before the run.
|
|
|
|
|
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Run targeted_power for ALL gpuIndices simultaneously so every card
|
|
|
|
|
// is under load during calibration — this reflects real server thermals.
|
|
|
|
|
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
|
|
|
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
|
|
|
|
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
|
|
|
|
doneCh := make(chan sharedAttemptResult, 1)
|
|
|
|
|
go func() {
|
|
|
|
|
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
|
|
|
|
|
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
|
|
|
|
}()
|
|
|
|
|
|
|
|
|
|
ticker := time.NewTicker(time.Second)
|
|
|
|
|
throttleReasons := make(map[int]string, len(active))
|
|
|
|
|
var ar sharedAttemptResult
|
|
|
|
|
|
|
|
|
|
attemptLoop:
|
|
|
|
|
for {
|
|
|
|
|
calib.Attempts++
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
|
|
|
|
|
|
|
|
|
|
beforeThrottle, _ := queryThrottleCounters(idx)
|
|
|
|
|
attemptCtx, cancel := context.WithCancel(ctx)
|
|
|
|
|
doneCh := make(chan calibrationAttemptResult, 1)
|
|
|
|
|
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
|
|
|
|
|
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
|
|
|
|
|
go func() {
|
|
|
|
|
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
|
|
|
|
|
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
|
|
|
|
|
}()
|
|
|
|
|
|
|
|
|
|
ticker := time.NewTicker(time.Second)
|
|
|
|
|
var (
|
|
|
|
|
attempt calibrationAttemptResult
|
|
|
|
|
throttleReason string
|
|
|
|
|
)
|
|
|
|
|
attemptLoop:
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case attempt = <-doneCh:
|
|
|
|
|
break attemptLoop
|
|
|
|
|
case <-ticker.C:
|
|
|
|
|
afterThrottle, err := queryThrottleCounters(idx)
|
|
|
|
|
select {
|
|
|
|
|
case ar = <-doneCh:
|
|
|
|
|
break attemptLoop
|
|
|
|
|
case <-ticker.C:
|
|
|
|
|
// Poll throttle counters for each active GPU independently.
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
if throttleReasons[s.idx] != "" {
|
|
|
|
|
continue // already detected for this GPU
|
|
|
|
|
}
|
|
|
|
|
after, err := queryThrottleCounters(s.idx)
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
|
|
|
|
|
throttleReason = reason
|
|
|
|
|
cancel()
|
|
|
|
|
// Record throttle but do NOT cancel — let dcgmi finish so
|
|
|
|
|
// nv-hostengine releases the slot cleanly before the next attempt.
|
|
|
|
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
|
|
|
|
throttleReasons[s.idx] = reason
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
|
|
|
|
}
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
cancel()
|
|
|
|
|
attempt = <-doneCh
|
|
|
|
|
break attemptLoop
|
|
|
|
|
}
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
cancelAttempt()
|
|
|
|
|
ar = <-doneCh
|
|
|
|
|
break attemptLoop
|
|
|
|
|
}
|
|
|
|
|
ticker.Stop()
|
|
|
|
|
cancel()
|
|
|
|
|
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
|
|
|
|
|
|
|
|
|
|
perGPU := filterRowsByGPU(attempt.rows, idx)
|
|
|
|
|
summary := summarizeBenchmarkTelemetry(perGPU)
|
|
|
|
|
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
|
|
|
|
|
calib.Summary = summary
|
|
|
|
|
calib.Completed = true
|
|
|
|
|
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// If DCGM reports the resource is in use, nv-hostengine has not yet
|
|
|
|
|
// released the diagnostic slot from the previous attempt. Do not
|
|
|
|
|
// derate: wait with exponential back-off and retry at the same
|
|
|
|
|
// power limit. Once the back-off delay would exceed
|
|
|
|
|
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
|
|
|
|
|
// held by something else.
|
|
|
|
|
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
|
|
|
|
|
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
busyRetries++
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
break calibLoop
|
|
|
|
|
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
|
|
|
|
}
|
|
|
|
|
next := busyDelaySec * 2
|
|
|
|
|
if next > dcgmResourceBusyMaxDelaySec {
|
|
|
|
|
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
|
|
|
|
|
}
|
|
|
|
|
busyDelaySec = next
|
|
|
|
|
continue calibLoop
|
|
|
|
|
}
|
|
|
|
|
busyRetries = 0 // reset on any non-busy outcome
|
|
|
|
|
busyDelaySec = 1 // reset back-off
|
|
|
|
|
|
|
|
|
|
switch {
|
|
|
|
|
case throttleReason != "":
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
|
|
|
|
|
case attempt.err != nil:
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
|
|
|
|
|
default:
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !canDerate || appliedLimitW <= 0 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
nextLimitW := appliedLimitW - derateStepW
|
|
|
|
|
if nextLimitW < minLimitW {
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
|
|
|
|
|
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
appliedLimitW = nextLimitW
|
|
|
|
|
calib.AppliedPowerLimitW = float64(appliedLimitW)
|
|
|
|
|
calib.Derated = true
|
|
|
|
|
info.PowerLimitW = float64(appliedLimitW)
|
|
|
|
|
infoByIndex[idx] = info
|
|
|
|
|
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
|
|
|
|
|
}
|
|
|
|
|
ticker.Stop()
|
|
|
|
|
cancelAttempt()
|
|
|
|
|
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
|
|
|
|
|
|
|
|
|
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
|
|
|
|
|
results[idx] = calib
|
|
|
|
|
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
|
|
|
|
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
|
|
|
|
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
|
|
|
|
|
s.converged = true
|
|
|
|
|
}
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
|
|
|
|
|
break calibDone
|
|
|
|
|
}
|
|
|
|
|
busyRetries++
|
|
|
|
|
// Undo attempt counter: busy retries don't count as real attempts.
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
s.calib.Attempts--
|
|
|
|
|
}
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
break calibDone
|
|
|
|
|
case <-time.After(time.Duration(busyDelaySec) * time.Second):
|
|
|
|
|
}
|
|
|
|
|
next := busyDelaySec * 2
|
|
|
|
|
if next > dcgmResourceBusyMaxDelaySec {
|
|
|
|
|
next = dcgmResourceBusyMaxDelaySec + 1
|
|
|
|
|
}
|
|
|
|
|
busyDelaySec = next
|
|
|
|
|
sharedAttempt-- // retry same logical attempt number
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
busyRetries = 0
|
|
|
|
|
busyDelaySec = 1
|
|
|
|
|
|
|
|
|
|
// Per-GPU analysis and binary search update.
|
|
|
|
|
for _, s := range active {
|
|
|
|
|
perGPU := filterRowsByGPU(ar.rows, s.idx)
|
|
|
|
|
summary := summarizeBenchmarkTelemetry(perGPU)
|
|
|
|
|
throttle := throttleReasons[s.idx]
|
|
|
|
|
|
|
|
|
|
// Cooling warning: thermal throttle with fans not at maximum.
|
|
|
|
|
if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
|
|
|
|
|
clocks := make([]float64, 0, len(perGPU))
|
|
|
|
|
var fanDutyValues []float64
|
|
|
|
|
fanDutyAvail := false
|
|
|
|
|
for _, r := range perGPU {
|
|
|
|
|
if r.ClockMHz > 0 {
|
|
|
|
|
clocks = append(clocks, r.ClockMHz)
|
|
|
|
|
}
|
|
|
|
|
if r.FanDutyCycleAvailable {
|
|
|
|
|
fanDutyAvail = true
|
|
|
|
|
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
dropPct := benchmarkClockDrift(clocks)
|
|
|
|
|
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
|
|
|
|
|
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
|
|
|
|
|
s.calib.CoolingWarning = fmt.Sprintf(
|
|
|
|
|
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
|
|
|
|
|
throttle, dropPct, p95FanDuty,
|
|
|
|
|
)
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
|
|
|
|
|
// Stable at current limit — update lo and binary-search upward.
|
|
|
|
|
s.calib.Summary = summary
|
|
|
|
|
s.calib.Completed = true
|
|
|
|
|
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
|
|
|
|
s.lo = s.appliedLimitW
|
|
|
|
|
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
|
|
|
|
next := roundTo5W((s.lo + s.hi) / 2)
|
|
|
|
|
if next > s.lo && next < s.hi {
|
|
|
|
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
|
|
|
|
|
s.appliedLimitW = next
|
|
|
|
|
s.calib.AppliedPowerLimitW = float64(next)
|
|
|
|
|
s.calib.Completed = false // keep searching
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
|
|
|
|
|
continue // next GPU in active list
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
s.converged = true
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Failed or throttled — log and binary-search downward.
|
|
|
|
|
switch {
|
|
|
|
|
case throttle != "":
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
|
|
|
|
case ar.err != nil:
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
|
|
|
|
default:
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if !canDerate || s.appliedLimitW <= 0 {
|
|
|
|
|
s.converged = true
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
s.hi = s.appliedLimitW
|
|
|
|
|
|
|
|
|
|
if s.hi-s.lo <= calibSearchTolerance {
|
|
|
|
|
if s.lo > s.minLimitW {
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
|
|
|
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
|
|
|
|
s.appliedLimitW = s.lo
|
|
|
|
|
s.calib.AppliedPowerLimitW = float64(s.lo)
|
|
|
|
|
s.calib.Derated = s.lo < s.originalLimitW
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
|
|
|
|
}
|
|
|
|
|
s.converged = true
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
next := roundTo5W((s.lo + s.hi) / 2)
|
|
|
|
|
if next <= s.lo {
|
|
|
|
|
next = s.lo + calibSearchTolerance
|
|
|
|
|
}
|
|
|
|
|
if next >= s.hi {
|
|
|
|
|
next = (s.lo + s.hi) / 2
|
|
|
|
|
}
|
|
|
|
|
if next < s.minLimitW {
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
|
|
|
|
s.converged = true
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
|
|
|
|
|
s.converged = true
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
s.appliedLimitW = next
|
|
|
|
|
s.calib.AppliedPowerLimitW = float64(next)
|
|
|
|
|
s.calib.Derated = next < s.originalLimitW
|
|
|
|
|
s.info.PowerLimitW = float64(next)
|
|
|
|
|
infoByIndex[s.idx] = s.info
|
|
|
|
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
|
|
|
|
|
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, s := range states {
|
|
|
|
|
if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
|
|
|
|
|
results[s.idx] = s.calib
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return results, restore
|
|
|
|
|
@@ -2649,6 +2815,11 @@ func isDCGMResourceBusy(err error) bool {
|
|
|
|
|
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// roundTo5W rounds w to the nearest 5 W boundary (half-way values round up).
func roundTo5W(w int) int {
	// Shifting by 2 before snapping down to a multiple of 5 implements
	// round-to-nearest: a - a%5 equals (a/5)*5 for Go's truncating division.
	shifted := w + 2
	return shifted - shifted%5
}
|
|
|
|
|
|
|
|
|
|
func powerBenchDurationSec(profile string) int {
|
|
|
|
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
|
|
|
|
case NvidiaBenchmarkProfileStability:
|
|
|
|
|
@@ -2823,6 +2994,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|
|
|
|
OccupiedSlots: occupied,
|
|
|
|
|
OccupiedSlotsNote: note,
|
|
|
|
|
Notes: append([]string(nil), calib.Notes...),
|
|
|
|
|
CoolingWarning: calib.CoolingWarning,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
sort.Slice(gpus, func(i, j int) bool {
|
|
|
|
|
@@ -2849,6 +3021,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|
|
|
|
if gpu.Derated {
|
|
|
|
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
|
|
|
|
}
|
|
|
|
|
if gpu.CoolingWarning != "" {
|
|
|
|
|
result.Findings = append(result.Findings, fmt.Sprintf(
|
|
|
|
|
"GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.",
|
|
|
|
|
gpu.Index, gpu.CoolingWarning,
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
|
|
|
|
|
for _, gpu := range gpus {
|
|
|
|
|
|