Use real NVIDIA power-limit bounds in benchmark

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:26:56 +03:00
parent a94e8007f8
commit ab802719f8
2 changed files with 101 additions and 35 deletions

View File

@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
VBIOS string
PowerLimitW float64
DefaultPowerLimitW float64
MinPowerLimitW float64
MaxPowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
@@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial"
}
// Enrich with max clocks from verbose output — covers GPUs where
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 {
@@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
// then recovered from nvidia-smi -q.
var benchmarkGPUInfoQueries = []struct {
fields string
extended bool // whether this query includes optional extended fields
@@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct {
},
}
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
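// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
// for fields that may be missing from --query-gpu on some driver versions.
// Max-clock, min/max power-limit and multiprocessor-count values that are
// already non-zero are left untouched; only values still at zero are filled.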
func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
return
}
@@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
}
}
}
if info.MinPowerLimitW == 0 {
if m := minPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MinPowerLimitW = v
}
}
}
if info.MaxPowerLimitW == 0 {
if m := maxPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MaxPowerLimitW = v
}
}
}
if info.MultiprocessorCount == 0 {
if m := smCountRe.FindSubmatch(section); m != nil {
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration(
if calibDurationSec <= 0 {
calibDurationSec = 120
}
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10
@@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration(
originalLimitW int
appliedLimitW int
minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW)
lo int // highest verified-stable limit
hi int // lowest verified-unstable limit (exclusive sentinel above start)
loVerified bool
calib benchmarkPowerCalibrationResult
converged bool
}
@@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration(
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
}
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
minLimitW := int(math.Round(info.MinPowerLimitW))
if minLimitW <= 0 {
minLimitW = appliedLimitW
}
if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance
maxLimitW := int(math.Round(info.MaxPowerLimitW))
if maxLimitW > 0 && appliedLimitW > maxLimitW {
appliedLimitW = maxLimitW
}
s := &gpuCalibState{
idx: idx,
@@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration(
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
s.appliedLimitW = minLimitW
s.hi = minLimitW + 1
}
if info.MinPowerLimitW <= 0 {
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
}
if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if seedW < s.minLimitW {
seedW = s.minLimitW
}
if maxLimitW > 0 && seedW > maxLimitW {
seedW = maxLimitW
}
if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
}
@@ -3331,6 +3353,7 @@ calibDone:
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
s.loVerified = true
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
@@ -3369,7 +3392,23 @@ calibDone:
s.hi = s.appliedLimitW
if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW {
if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
s.converged = true
continue
}
s.appliedLimitW = s.minLimitW
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
s.calib.Derated = s.minLimitW < s.originalLimitW
s.info.PowerLimitW = float64(s.minLimitW)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
continue
}
if s.loVerified {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo
@@ -3381,7 +3420,8 @@ calibDone:
s.calib.Completed = true
}
} else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
}
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true
@@ -3396,9 +3436,7 @@ calibDone:
next = (s.lo + s.hi) / 2
}
if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
s.converged = true
continue
next = s.minLimitW
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())

View File

@@ -356,12 +356,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
}
}
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
Min Power Limit : 200.00 W
Max Power Limit : 600.00 W
Default Power Limit : 575.00 W
Current Power Limit : 560.00 W
Clocks
Graphics : 2422 MHz
Memory : 12481 MHz
@@ -383,7 +387,7 @@ GPU 00000000:4F:00.0
1: {Index: 1, BusID: "00000000:4F:00.0"},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -397,25 +401,49 @@ GPU 00000000:4F:00.0
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
}
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
}
if infoByIndex[0].MaxPowerLimitW != 600 {
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
}
if infoByIndex[0].DefaultPowerLimitW != 575 {
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
}
if infoByIndex[0].PowerLimitW != 560 {
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
}
}
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
t.Parallel()
nvsmiQ := []byte(`
GPU 00000000:4E:00.0
Min Power Limit : 100.00 W
Max Power Limit : 900.00 W
Max Clocks
Graphics : 9999 MHz
Memory : 9999 MHz
`)
// Already populated — must not be overwritten.
infoByIndex := map[int]benchmarkGPUInfo{
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
0: {
Index: 0,
BusID: "00000000:4E:00.0",
MaxGraphicsClockMHz: 2430,
MaxMemoryClockMHz: 12481,
MinPowerLimitW: 200,
MaxPowerLimitW: 600,
},
}
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
}
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
}
}