Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 19dbabd71d |
@@ -2476,9 +2476,6 @@ func runBenchmarkPowerCalibration(
|
|||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
const calibSearchTolerance = 10
|
const calibSearchTolerance = 10
|
||||||
// calibPreThrottleMarginW is subtracted from the telemetry-estimated
|
|
||||||
// pre-throttle power draw to produce a smarter initial search candidate.
|
|
||||||
const calibPreThrottleMarginW = 10
|
|
||||||
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
// dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM
|
||||||
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
// returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, …
|
||||||
// doubling each retry until it would exceed the cap, at which point the
|
// doubling each retry until it would exceed the cap, at which point the
|
||||||
@@ -2717,20 +2714,8 @@ func runBenchmarkPowerCalibration(
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute the next candidate.
|
// Binary midpoint within the remaining search range.
|
||||||
// For thermal throttle: use the pre-throttle power draw from telemetry
|
nextLimitW := roundTo5W((lo + hi) / 2)
|
||||||
// as a smarter initial estimate instead of the binary midpoint — it
|
|
||||||
// lands much closer to the true limit on the first attempt.
|
|
||||||
nextLimitW := (lo + hi) / 2
|
|
||||||
if strings.Contains(throttleReason, "thermal") {
|
|
||||||
if onsetW := calibPreThrottlePowerW(perGPU); onsetW > 0 {
|
|
||||||
candidate := roundTo5W(int(math.Round(onsetW)) - calibPreThrottleMarginW)
|
|
||||||
if candidate > lo && candidate < hi {
|
|
||||||
nextLimitW = candidate
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
nextLimitW = roundTo5W(nextLimitW)
|
|
||||||
// Ensure the candidate is strictly inside the search range.
|
// Ensure the candidate is strictly inside the search range.
|
||||||
if nextLimitW <= lo {
|
if nextLimitW <= lo {
|
||||||
nextLimitW = lo + calibSearchTolerance
|
nextLimitW = lo + calibSearchTolerance
|
||||||
@@ -2770,28 +2755,6 @@ func isDCGMResourceBusy(err error) bool {
|
|||||||
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
|
||||||
}
|
}
|
||||||
|
|
||||||
// calibPreThrottlePowerW estimates the GPU power draw just before thermal
|
|
||||||
// throttle onset by averaging the first quarter of telemetry rows. The early
|
|
||||||
// samples capture the GPU at peak before clock/power reduction kicks in.
|
|
||||||
func calibPreThrottlePowerW(rows []GPUMetricRow) float64 {
|
|
||||||
if len(rows) < 4 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
n := len(rows) / 4
|
|
||||||
var sum float64
|
|
||||||
var cnt int
|
|
||||||
for _, r := range rows[:n] {
|
|
||||||
if r.PowerW > 0 {
|
|
||||||
sum += r.PowerW
|
|
||||||
cnt++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if cnt == 0 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return sum / float64(cnt)
|
|
||||||
}
|
|
||||||
|
|
||||||
// roundTo5W rounds w to the nearest 5 W boundary.
|
// roundTo5W rounds w to the nearest 5 W boundary.
|
||||||
func roundTo5W(w int) int {
|
func roundTo5W(w int) int {
|
||||||
return ((w + 2) / 5) * 5
|
return ((w + 2) / 5) * 5
|
||||||
|
|||||||
Reference in New Issue
Block a user