Use real NVIDIA power-limit bounds in benchmark
This commit is contained in:
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
|
||||
VBIOS string
|
||||
PowerLimitW float64
|
||||
DefaultPowerLimitW float64
|
||||
MinPowerLimitW float64
|
||||
MaxPowerLimitW float64
|
||||
MaxGraphicsClockMHz float64
|
||||
MaxMemoryClockMHz float64
|
||||
BaseGraphicsClockMHz float64
|
||||
@@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||
result.Normalization.Status = "partial"
|
||||
}
|
||||
// Enrich with max clocks from verbose output — covers GPUs where
|
||||
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
|
||||
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
|
||||
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
|
||||
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
|
||||
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
|
||||
|
||||
activeApps, err := queryActiveComputeApps(selected)
|
||||
if err == nil && len(activeApps) > 0 {
|
||||
@@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||
// all driver versions, so we fall back to the base set if the full query fails.
|
||||
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
||||
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
|
||||
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
|
||||
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
|
||||
// then recovered from nvidia-smi -q.
|
||||
var benchmarkGPUInfoQueries = []struct {
|
||||
fields string
|
||||
extended bool // whether this query includes optional extended fields
|
||||
@@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct {
|
||||
},
|
||||
}
|
||||
|
||||
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
|
||||
// any GPU in infoByIndex where those values are still zero. It parses the
|
||||
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
|
||||
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
|
||||
// return exit status 2 but the verbose query works fine.
|
||||
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||
// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
|
||||
// for fields that may be missing from --query-gpu on some driver versions; only fields that are still zero are filled.
|
||||
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
||||
return
|
||||
}
|
||||
@@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
||||
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
|
||||
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
|
||||
@@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.MinPowerLimitW == 0 {
|
||||
if m := minPwrRe.FindSubmatch(section); m != nil {
|
||||
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||
info.MinPowerLimitW = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.MaxPowerLimitW == 0 {
|
||||
if m := maxPwrRe.FindSubmatch(section); m != nil {
|
||||
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||
info.MaxPowerLimitW = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if info.MultiprocessorCount == 0 {
|
||||
if m := smCountRe.FindSubmatch(section); m != nil {
|
||||
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
||||
@@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration(
|
||||
if calibDurationSec <= 0 {
|
||||
calibDurationSec = 120
|
||||
}
|
||||
const maxDerateW = 150
|
||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||
// When hi-lo ≤ this, the search has converged: lo is used if it was verified stable; otherwise the minimum settable limit is validated before giving up.
|
||||
const calibSearchTolerance = 10
|
||||
@@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration(
|
||||
originalLimitW int
|
||||
appliedLimitW int
|
||||
minLimitW int
|
||||
lo int // highest verified-stable limit (assumed: minLimitW)
|
||||
lo int // highest verified-stable limit
|
||||
hi int // lowest verified-unstable limit (exclusive sentinel above start)
|
||||
loVerified bool
|
||||
calib benchmarkPowerCalibrationResult
|
||||
converged bool
|
||||
}
|
||||
@@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration(
|
||||
if appliedLimitW <= 0 {
|
||||
appliedLimitW = defaultLimitW
|
||||
}
|
||||
minLimitW := appliedLimitW
|
||||
switch {
|
||||
case defaultLimitW > 0:
|
||||
minLimitW = defaultLimitW - maxDerateW
|
||||
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
|
||||
if minLimitW < floorByRatio {
|
||||
minLimitW = floorByRatio
|
||||
}
|
||||
case appliedLimitW > 0:
|
||||
minLimitW = appliedLimitW - maxDerateW
|
||||
minLimitW := int(math.Round(info.MinPowerLimitW))
|
||||
if minLimitW <= 0 {
|
||||
minLimitW = appliedLimitW
|
||||
}
|
||||
if minLimitW < calibSearchTolerance {
|
||||
minLimitW = calibSearchTolerance
|
||||
maxLimitW := int(math.Round(info.MaxPowerLimitW))
|
||||
if maxLimitW > 0 && appliedLimitW > maxLimitW {
|
||||
appliedLimitW = maxLimitW
|
||||
}
|
||||
s := &gpuCalibState{
|
||||
idx: idx,
|
||||
@@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration(
|
||||
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
|
||||
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
|
||||
}
|
||||
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
|
||||
s.appliedLimitW = minLimitW
|
||||
s.hi = minLimitW + 1
|
||||
}
|
||||
if info.MinPowerLimitW <= 0 {
|
||||
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
|
||||
}
|
||||
if seedLimits != nil {
|
||||
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
|
||||
// A previously validated limit is only a starting point. Re-run
|
||||
// targeted_power under the current multi-GPU thermal load and derate
|
||||
// again if this step shows new throttling.
|
||||
if seedW < s.minLimitW {
|
||||
seedW = s.minLimitW
|
||||
}
|
||||
if maxLimitW > 0 && seedW > maxLimitW {
|
||||
seedW = maxLimitW
|
||||
}
|
||||
if canDerate {
|
||||
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
|
||||
}
|
||||
@@ -3331,6 +3353,7 @@ calibDone:
|
||||
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
|
||||
s.lo = s.appliedLimitW
|
||||
s.loVerified = true
|
||||
if canDerate && s.hi-s.lo > calibSearchTolerance {
|
||||
next := roundTo5W((s.lo + s.hi) / 2)
|
||||
if next > s.lo && next < s.hi {
|
||||
@@ -3369,7 +3392,23 @@ calibDone:
|
||||
s.hi = s.appliedLimitW
|
||||
|
||||
if s.hi-s.lo <= calibSearchTolerance {
|
||||
if s.lo > s.minLimitW {
|
||||
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
|
||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
s.appliedLimitW = s.minLimitW
|
||||
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
|
||||
s.calib.Derated = s.minLimitW < s.originalLimitW
|
||||
s.info.PowerLimitW = float64(s.minLimitW)
|
||||
infoByIndex[s.idx] = s.info
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
|
||||
continue
|
||||
}
|
||||
if s.loVerified {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
|
||||
s.appliedLimitW = s.lo
|
||||
@@ -3381,7 +3420,8 @@ calibDone:
|
||||
s.calib.Completed = true
|
||||
}
|
||||
} else {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
|
||||
}
|
||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||
s.converged = true
|
||||
@@ -3396,9 +3436,7 @@ calibDone:
|
||||
next = (s.lo + s.hi) / 2
|
||||
}
|
||||
if next < s.minLimitW {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||
s.converged = true
|
||||
continue
|
||||
next = s.minLimitW
|
||||
}
|
||||
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
|
||||
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
|
||||
|
||||
Reference in New Issue
Block a user