Use real NVIDIA power-limit bounds in benchmark

This commit is contained in:
Mikhail Chusavitin
2026-04-20 09:26:56 +03:00
parent a94e8007f8
commit ab802719f8
2 changed files with 101 additions and 35 deletions

View File

@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
VBIOS string VBIOS string
PowerLimitW float64 PowerLimitW float64
DefaultPowerLimitW float64 DefaultPowerLimitW float64
MinPowerLimitW float64
MaxPowerLimitW float64
MaxGraphicsClockMHz float64 MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64 MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64 BaseGraphicsClockMHz float64
@@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
result.Normalization.Status = "partial" result.Normalization.Status = "partial"
} }
// Enrich with max clocks from verbose output — covers GPUs where // Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x). // are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut) enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
activeApps, err := queryActiveComputeApps(selected) activeApps, err := queryActiveComputeApps(selected)
if err == nil && len(activeApps) > 0 { if err == nil && len(activeApps) > 0 {
@@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
// (attribute.multiprocessor_count, power.default_limit) are not supported on // (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails. // all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns // The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are // exit status 2 on some GPU generations (e.g. Blackwell); missing data is
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks. // then recovered from nvidia-smi -q.
var benchmarkGPUInfoQueries = []struct { var benchmarkGPUInfoQueries = []struct {
fields string fields string
extended bool // whether this query includes optional extended fields extended bool // whether this query includes optional extended fields
@@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct {
}, },
} }
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for // enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
// any GPU in infoByIndex where those values are still zero. It parses the // for fields that may be missing from --query-gpu on some driver versions.
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ). func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 { if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
return return
} }
@@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`) shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`) slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
} }
} }
} }
if info.MinPowerLimitW == 0 {
if m := minPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MinPowerLimitW = v
}
}
}
if info.MaxPowerLimitW == 0 {
if m := maxPwrRe.FindSubmatch(section); m != nil {
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
info.MaxPowerLimitW = v
}
}
}
if info.MultiprocessorCount == 0 { if info.MultiprocessorCount == 0 {
if m := smCountRe.FindSubmatch(section); m != nil { if m := smCountRe.FindSubmatch(section); m != nil {
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 { if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration(
if calibDurationSec <= 0 { if calibDurationSec <= 0 {
calibDurationSec = 120 calibDurationSec = 120
} }
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts. // calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
const calibSearchTolerance = 10 const calibSearchTolerance = 10
@@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration(
originalLimitW int originalLimitW int
appliedLimitW int appliedLimitW int
minLimitW int minLimitW int
lo int // highest verified-stable limit (assumed: minLimitW) lo int // highest verified-stable limit
hi int // lowest verified-unstable limit (exclusive sentinel above start) hi int // lowest verified-unstable limit (exclusive sentinel above start)
loVerified bool
calib benchmarkPowerCalibrationResult calib benchmarkPowerCalibrationResult
converged bool converged bool
} }
@@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration(
if appliedLimitW <= 0 { if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW appliedLimitW = defaultLimitW
} }
minLimitW := appliedLimitW minLimitW := int(math.Round(info.MinPowerLimitW))
switch { if minLimitW <= 0 {
case defaultLimitW > 0: minLimitW = appliedLimitW
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
}
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
} }
if minLimitW < calibSearchTolerance { maxLimitW := int(math.Round(info.MaxPowerLimitW))
minLimitW = calibSearchTolerance if maxLimitW > 0 && appliedLimitW > maxLimitW {
appliedLimitW = maxLimitW
} }
s := &gpuCalibState{ s := &gpuCalibState{
idx: idx, idx: idx,
@@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration(
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
} }
if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
s.appliedLimitW = minLimitW
s.hi = minLimitW + 1
}
if info.MinPowerLimitW <= 0 {
s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
}
if seedLimits != nil { if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 { if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run // A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate // targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling. // again if this step shows new throttling.
if seedW < s.minLimitW {
seedW = s.minLimitW
}
if maxLimitW > 0 && seedW > maxLimitW {
seedW = maxLimitW
}
if canDerate { if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW) _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
} }
@@ -3331,6 +3353,7 @@ calibDone:
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW) s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW s.lo = s.appliedLimitW
s.loVerified = true
if canDerate && s.hi-s.lo > calibSearchTolerance { if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2) next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi { if next > s.lo && next < s.hi {
@@ -3369,7 +3392,23 @@ calibDone:
s.hi = s.appliedLimitW s.hi = s.appliedLimitW
if s.hi-s.lo <= calibSearchTolerance { if s.hi-s.lo <= calibSearchTolerance {
if s.lo > s.minLimitW { if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
s.converged = true
continue
}
s.appliedLimitW = s.minLimitW
s.calib.AppliedPowerLimitW = float64(s.minLimitW)
s.calib.Derated = s.minLimitW < s.originalLimitW
s.info.PowerLimitW = float64(s.minLimitW)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
continue
}
if s.loVerified {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil { if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
s.appliedLimitW = s.lo s.appliedLimitW = s.lo
@@ -3381,7 +3420,8 @@ calibDone:
s.calib.Completed = true s.calib.Completed = true
} }
} else { } else {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
} }
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx) s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
s.converged = true s.converged = true
@@ -3396,9 +3436,7 @@ calibDone:
next = (s.lo + s.hi) / 2 next = (s.lo + s.hi) / 2
} }
if next < s.minLimitW { if next < s.minLimitW {
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) next = s.minLimitW
s.converged = true
continue
} }
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil { if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error()) s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())

View File

@@ -356,12 +356,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
} }
} }
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
t.Parallel() t.Parallel()
nvsmiQ := []byte(` nvsmiQ := []byte(`
GPU 00000000:4E:00.0 GPU 00000000:4E:00.0
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
Min Power Limit : 200.00 W
Max Power Limit : 600.00 W
Default Power Limit : 575.00 W
Current Power Limit : 560.00 W
Clocks Clocks
Graphics : 2422 MHz Graphics : 2422 MHz
Memory : 12481 MHz Memory : 12481 MHz
@@ -383,7 +387,7 @@ GPU 00000000:4F:00.0
1: {Index: 1, BusID: "00000000:4F:00.0"}, 1: {Index: 1, BusID: "00000000:4F:00.0"},
} }
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 { if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz) t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -397,25 +401,49 @@ GPU 00000000:4F:00.0
if infoByIndex[1].MaxMemoryClockMHz != 12481 { if infoByIndex[1].MaxMemoryClockMHz != 12481 {
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz) t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
} }
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
}
if infoByIndex[0].MaxPowerLimitW != 600 {
t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
}
if infoByIndex[0].DefaultPowerLimitW != 575 {
t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
}
if infoByIndex[0].PowerLimitW != 560 {
t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
}
} }
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) { func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
t.Parallel() t.Parallel()
nvsmiQ := []byte(` nvsmiQ := []byte(`
GPU 00000000:4E:00.0 GPU 00000000:4E:00.0
Min Power Limit : 100.00 W
Max Power Limit : 900.00 W
Max Clocks Max Clocks
Graphics : 9999 MHz Graphics : 9999 MHz
Memory : 9999 MHz Memory : 9999 MHz
`) `)
// Already populated — must not be overwritten. // Already populated — must not be overwritten.
infoByIndex := map[int]benchmarkGPUInfo{ infoByIndex := map[int]benchmarkGPUInfo{
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481}, 0: {
Index: 0,
BusID: "00000000:4E:00.0",
MaxGraphicsClockMHz: 2430,
MaxMemoryClockMHz: 12481,
MinPowerLimitW: 200,
MaxPowerLimitW: 600,
},
} }
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
if infoByIndex[0].MaxGraphicsClockMHz != 2430 { if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz) t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
} }
if infoByIndex[0].MinPowerLimitW != 200 {
t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
}
} }