Compare commits

..

2 Commits
v8.4 ... v8.6

Author SHA1 Message Date
3cf2e9c9dc Run power calibration for all GPUs simultaneously
Previously each GPU was calibrated sequentially (one card fully done
before the next started), producing the staircase temperature pattern
seen on the graph.

Now all GPUs run together in a single dcgmi diag -r targeted_power
session per attempt. This means:
- All cards are under realistic thermal load at the same time.
- A single DCGM session handles the run — no resource-busy contention
  from concurrent dcgmi processes.
- Binary search state (lo/hi) is tracked independently per GPU; each
  card converges to its own highest stable power limit.
- Throttle counter polling covers all active GPUs in the shared ticker.
- Resource-busy exponential back-off is shared (one DCGM session).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:25:05 +03:00
19dbabd71d Simplify power calibration: pure binary search, no telemetry guessing
Remove telemetry-guided initial candidate; use strict binary search
midpoint at every step. Clean and predictable convergence in O(log N)
attempts within the allowed power range [minLimitW, startingLimitW].

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:12:45 +03:00

View File

@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
// calibSearchTolerance is the binary-search convergence threshold in watts. // calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10 const calibSearchTolerance = 10
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
// pre-throttle power draw to produce a smarter initial search candidate.
const calibPreThrottleMarginW = 10
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM // dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, … // returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
// doubling each retry until it would exceed the cap, at which point the // doubling each retry until it would exceed the cap, at which point the
@@ -2501,8 +2498,25 @@ func runBenchmarkPowerCalibration(
err error err error
} }
// gpuCalibState holds per-GPU binary search state during parallel calibration.
// One instance is created for each entry of gpuIndices; the shared attempt
// loop updates it after every targeted_power run until converged is set.
type gpuCalibState struct {
// idx is the GPU index this state belongs to (an element of gpuIndices).
idx int
// info is this GPU's entry from infoByIndex; when the search steps the
// limit down, its PowerLimitW is updated and the entry is written back.
info benchmarkGPUInfo
// originalLimitW is the power limit the GPU started with, in watts; the
// Derated flag is derived by comparing the chosen limit against it.
originalLimitW int
// appliedLimitW is the limit (W) used for the current/next attempt. It is
// only advanced after setBenchmarkPowerLimit succeeds.
appliedLimitW int
// minLimitW is the floor of the search range (W); it seeds lo and a
// candidate below it ends the search for this GPU.
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW); raised to appliedLimitW after a stable run
hi int // lowest verified-unstable limit (exclusive sentinel above start); lowered to appliedLimitW after a failed or throttled run
// calib accumulates attempts, notes, summary and flags for this GPU and is
// copied into the results map once calibration ends.
calib benchmarkPowerCalibrationResult
// converged marks the search as finished; converged GPUs are left out of
// the active set for later attempts.
converged bool
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction var restore []benchmarkRestoreAction
// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
info := infoByIndex[idx] info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW)) originalLimitW := int(math.Round(info.PowerLimitW))
@@ -2531,17 +2545,17 @@ func runBenchmarkPowerCalibration(
if minLimitW < calibSearchTolerance { if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance minLimitW = calibSearchTolerance
} }
s := &gpuCalibState{
calib := benchmarkPowerCalibrationResult{ idx: idx,
AppliedPowerLimitW: float64(appliedLimitW), info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
} }
// Binary search bounds for finding the highest stable power limit. states = append(states, s)
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 { if canDerate && originalLimitW > 0 {
idxCopy := idx idxCopy := idx
orig := originalLimitW orig := originalLimitW
@@ -2552,212 +2566,243 @@ func runBenchmarkPowerCalibration(
}, },
}) })
} }
}
calibLoop: // Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0
// sharedAttemptResult carries the three return values of
// runBenchmarkCommandWithMetrics for one shared targeted_power run from the
// worker goroutine back to the calibration loop via doneCh.
type sharedAttemptResult struct {
// out is the command output; it is written to the per-attempt log file.
out []byte
// rows holds the metric rows of the run; they are split per GPU with
// filterRowsByGPU before being summarised.
rows []GPUMetricRow
// err is the command error; it is checked with isDCGMResourceBusy before
// any per-GPU analysis.
err error
}
calibDone:
for {
// Collect non-converged GPUs.
var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}
sharedAttempt++
for _, s := range active {
s.calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
}
// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
throttleReasons := make(map[int]string, len(active))
var ar sharedAttemptResult
attemptLoop:
for { for {
calib.Attempts++ select {
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) case ar = <-doneCh:
break attemptLoop
beforeThrottle, _ := queryThrottleCounters(idx) case <-ticker.C:
attemptCtx, cancel := context.WithCancel(ctx) // Poll throttle counters for each active GPU independently.
doneCh := make(chan calibrationAttemptResult, 1) for _, s := range active {
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts) if throttleReasons[s.idx] != "" {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx}) continue // already detected for this GPU
go func() { }
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc) after, err := queryThrottleCounters(s.idx)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
if err != nil { if err != nil {
continue continue
} }
// Record the throttle reason but do NOT cancel the dcgmi // Record throttle but do NOT cancel — let dcgmi finish so
// process. Killing it mid-run leaves nv-hostengine holding // nv-hostengine releases the slot cleanly before the next attempt.
// the diagnostic slot, which causes DCGM_ST_IN_USE on every if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
// subsequent attempt. Let targeted_power run to its natural throttleReasons[s.idx] = reason
// end so the daemon releases the slot cleanly before we logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
// reduce power and retry.
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
} }
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
} }
case <-ctx.Done():
cancelAttempt()
ar = <-doneCh
break attemptLoop
} }
ticker.Stop() }
cancel() ticker.Stop()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644) cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx) // Resource busy: retry with exponential back-off (shared — one DCGM session).
if ar.err != nil && isDCGMResourceBusy(ar.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
for _, s := range active {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
}
busyRetries++
// Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select {
case <-ctx.Done():
break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1
}
busyDelaySec = next
sharedAttempt-- // retry same logical attempt number
continue
}
busyRetries = 0
busyDelaySec = 1
// Per-GPU analysis and binary search update.
for _, s := range active {
perGPU := filterRowsByGPU(ar.rows, s.idx)
summary := summarizeBenchmarkTelemetry(perGPU) summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { throttle := throttleReasons[s.idx]
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary // Cooling warning: thermal throttle with fans not at maximum.
calib.Completed = true if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
calib.AppliedPowerLimitW = float64(appliedLimitW) clocks := make([]float64, 0, len(perGPU))
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) var fanDutyValues []float64
lo = appliedLimitW fanDutyAvail := false
// If there is still headroom to search, try a higher level. for _, r := range perGPU {
if canDerate && hi-lo > calibSearchTolerance { if r.ClockMHz > 0 {
nextLimitW := roundTo5W((lo + hi) / 2) clocks = append(clocks, r.ClockMHz)
if nextLimitW > lo && nextLimitW < hi { }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil { if r.FanDutyCycleAvailable {
appliedLimitW = nextLimitW fanDutyAvail = true
calib.AppliedPowerLimitW = float64(appliedLimitW) fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi)) }
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW)) }
continue calibLoop dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttle, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
}
}
if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
// Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
} }
} }
} }
break s.converged = true
continue
} }
// If DCGM reports the resource is in use, nv-hostengine has not yet // Failed or throttled — log and binary-search downward.
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
break
}
busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
select {
case <-ctx.Done():
break calibLoop
case <-time.After(time.Duration(busyDelaySec) * time.Second):
}
next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
}
busyDelaySec = next
continue calibLoop
}
busyRetries = 0 // reset on any non-busy outcome
busyDelaySec = 1 // reset back-off
switch { switch {
case throttleReason != "": case throttle != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
// Check whether the thermal throttle coincided with fans below case ar.err != nil:
// maximum: that combination suggests cooling misconfiguration s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
// rather than a fundamental power-delivery limit. logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64
fanDutyAvail := false
for _, r := range perGPU {
if r.ClockMHz > 0 {
clocks = append(clocks, r.ClockMHz)
}
if r.FanDutyCycleAvailable {
fanDutyAvail = true
fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
}
}
dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty,
)
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
}
}
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
default: default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
} }
if !canDerate || appliedLimitW <= 0 { if !canDerate || s.appliedLimitW <= 0 {
break s.converged = true
continue
} }
// Binary-search for the highest stable power limit. s.hi = s.appliedLimitW
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
if hi-lo <= calibSearchTolerance { if s.hi-s.lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level. if s.lo > s.minLimitW {
if lo > minLimitW { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi)) if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil { s.appliedLimitW = s.lo
appliedLimitW = lo s.calib.AppliedPowerLimitW = float64(s.lo)
calib.AppliedPowerLimitW = float64(lo) s.calib.Derated = s.lo < s.originalLimitW
calib.Derated = lo < originalLimitW
} }
} else { } else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
} }
break s.converged = true
continue
} }
// Compute the next candidate. next := roundTo5W((s.lo + s.hi) / 2)
// For thermal throttle: use the pre-throttle power draw from telemetry if next <= s.lo {
// as a smarter initial estimate instead of the binary midpoint — it next = s.lo + calibSearchTolerance
// lands much closer to the true limit on the first attempt.
nextLimitW := (lo + hi) / 2
if strings.Contains(throttleReason, "thermal") {
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
if candidate > lo && candidate < hi {
nextLimitW = candidate
}
}
} }
nextLimitW = roundTo5W(nextLimitW) if next >= s.hi {
// Ensure the candidate is strictly inside the search range. next = (s.lo + s.hi) / 2
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
} }
if nextLimitW >= hi { if next < s.minLimitW {
nextLimitW = (lo + hi) / 2 s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
s.converged = true
continue
} }
if nextLimitW < minLimitW { if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
break logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
s.converged = true
continue
} }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { s.appliedLimitW = next
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error()) s.calib.AppliedPowerLimitW = float64(next)
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err)) s.calib.Derated = next < s.originalLimitW
break s.info.PowerLimitW = float64(next)
} infoByIndex[s.idx] = s.info
appliedLimitW = nextLimitW s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
calib.AppliedPowerLimitW = float64(appliedLimitW) logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
} }
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { for _, s := range states {
results[idx] = calib if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
} }
} }
return results, restore return results, restore
@@ -2770,28 +2815,6 @@ func isDCGMResourceBusy(err error) bool {
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222 return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
} }
// calibPreThrottlePowerW approximates the power draw a GPU sustained before
// thermal throttling set in. It looks only at the leading quarter of the
// telemetry rows (len(rows)/4, rounded down) and returns the mean of the
// strictly positive PowerW readings found there.
//
// The result is 0 when fewer than four rows are supplied or when none of the
// inspected rows carries a positive power reading.
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
	quarter := len(rows) / 4
	if quarter == 0 {
		// Fewer than four samples: no meaningful "early" window.
		return 0
	}
	total, valid := 0.0, 0
	for i := 0; i < quarter; i++ {
		// Non-positive readings are skipped, not averaged in.
		if p := rows[i].PowerW; p > 0 {
			total += p
			valid++
		}
	}
	if valid > 0 {
		return total / float64(valid)
	}
	return 0
}
// roundTo5W rounds w to the nearest 5 W boundary. // roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int { func roundTo5W(w int) int {
return ((w + 2) / 5) * 5 return ((w + 2) / 5) * 5