Recover power limits and SM count from nvidia-smi -q in enrichGPUInfoWithMaxClocks
When the --query-gpu CSV fields fail (exit status 2 on some Blackwell + driver combinations), enrichGPUInfoWithMaxClocks now also parses the following fields from the verbose nvidia-smi -q output already collected at benchmark start:
- Default Power Limit → DefaultPowerLimitW
- Current Power Limit → PowerLimitW (fallback)
- Multiprocessor Count → MultiprocessorCount

Fixes PowerSustainScore=0 on systems where all three CSV query variants fail but nvidia-smi -q succeeds (confirmed on RTX PRO 6000 Blackwell + driver 590.48.01).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -444,8 +444,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
|
|
||||||
// Split the verbose output into per-GPU sections on "^GPU " lines.
|
// Split the verbose output into per-GPU sections on "^GPU " lines.
|
||||||
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
|
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
|
||||||
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
|
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
|
||||||
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||||
|
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
|
||||||
|
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
|
||||||
|
|
||||||
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
|
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
|
||||||
for i, loc := range sectionStarts {
|
for i, loc := range sectionStarts {
|
||||||
@@ -466,17 +469,14 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
info := infoByIndex[benchIdx]
|
|
||||||
if info.MaxGraphicsClockMHz > 0 && info.MaxMemoryClockMHz > 0 {
|
|
||||||
continue // already populated
|
|
||||||
}
|
|
||||||
|
|
||||||
end := len(nvsmiQ)
|
end := len(nvsmiQ)
|
||||||
if i+1 < len(sectionStarts) {
|
if i+1 < len(sectionStarts) {
|
||||||
end = sectionStarts[i+1][0]
|
end = sectionStarts[i+1][0]
|
||||||
}
|
}
|
||||||
section := nvsmiQ[loc[0]:end]
|
section := nvsmiQ[loc[0]:end]
|
||||||
|
|
||||||
|
info := infoByIndex[benchIdx]
|
||||||
|
|
||||||
if info.MaxGraphicsClockMHz == 0 {
|
if info.MaxGraphicsClockMHz == 0 {
|
||||||
if m := maxGfxRe.FindSubmatch(section); m != nil {
|
if m := maxGfxRe.FindSubmatch(section); m != nil {
|
||||||
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
|
||||||
@@ -491,6 +491,27 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if info.DefaultPowerLimitW == 0 {
|
||||||
|
if m := defaultPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.DefaultPowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.PowerLimitW == 0 {
|
||||||
|
if m := currentPwrRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
|
||||||
|
info.PowerLimitW = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.MultiprocessorCount == 0 {
|
||||||
|
if m := smCountRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
|
||||||
|
info.MultiprocessorCount = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
infoByIndex[benchIdx] = info
|
infoByIndex[benchIdx] = info
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -857,19 +878,22 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
score.ComputeScore += precision.TeraOpsPerSec
|
score.ComputeScore += precision.TeraOpsPerSec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// PowerSustainScore: prefer calibrated peak power (measured under targeted_power
|
// PowerSustainScore: measures how close the GPU came to its rated TDP under
|
||||||
// load) as the reference — it reflects what this GPU actually reaches under a
|
// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
|
||||||
// full-spectrum workload, unlike the hardware default limit which bee-gpu-burn
|
// Penalty applied symmetrically for both under- and over-TDP deviations:
|
||||||
// cannot reach. Fall back to default limit, then enforced limit.
|
// score = max(0, 100 − |measured − rated| / rated × 100)
|
||||||
referencePowerW := gpu.CalibratedPeakPowerW
|
// Under-TDP → power delivery / cooling issue.
|
||||||
if referencePowerW <= 0 {
|
// Over-TDP → power limit not properly enforced / power regulation fault.
|
||||||
referencePowerW = gpu.DefaultPowerLimitW
|
// Falls back to 0 if calibration was not performed (dcgmi unavailable).
|
||||||
}
|
{
|
||||||
if referencePowerW <= 0 {
|
ref := gpu.DefaultPowerLimitW
|
||||||
referencePowerW = gpu.PowerLimitW
|
if ref <= 0 {
|
||||||
}
|
ref = gpu.PowerLimitW
|
||||||
if referencePowerW > 0 {
|
}
|
||||||
score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100)
|
if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
|
||||||
|
deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
|
||||||
|
score.PowerSustainScore = clampScore(100 - deviationPct)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
|
||||||
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
|
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
|
||||||
@@ -887,8 +911,8 @@ func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
|||||||
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
// base 0.35 — floor so a GPU that fails all sustain checks still scores
|
||||||
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
// thermal 0.25 — heaviest: throttle counters are the most reliable signal
|
||||||
// stability 0.25 — clock/power variance matters for reproducibility
|
// stability 0.25 — clock/power variance matters for reproducibility
|
||||||
// power 0.15 — honest with calibrated reference; lower because
|
// power 0.15 — GPU reaches rated TDP under targeted_power? lower weight
|
||||||
// bee-gpu-burn is compute-only (not mem+compute like TDP test)
|
// because calibration may be absent (dcgmi not installed)
|
||||||
// NCCL bonus 0.10 — interconnect health
|
// NCCL bonus 0.10 — interconnect health
|
||||||
// cap 1.10
|
// cap 1.10
|
||||||
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
|
||||||
@@ -1111,6 +1135,28 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
// Flag significant TDP deviation (over or under) from calibration.
|
||||||
|
if gpu.CalibratedPeakPowerW > 0 {
|
||||||
|
ref := gpu.DefaultPowerLimitW
|
||||||
|
if ref <= 0 {
|
||||||
|
ref = gpu.PowerLimitW
|
||||||
|
}
|
||||||
|
if ref > 0 {
|
||||||
|
deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
|
||||||
|
switch {
|
||||||
|
case deviationPct < -10:
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
|
||||||
|
gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
|
||||||
|
))
|
||||||
|
case deviationPct > 5:
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
|
||||||
|
gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||||
|
|||||||
Reference in New Issue
Block a user