Fix GPU clock lock normalization for Blackwell (clocks.max.* unsupported)
Querying the clocks.max.graphics / clocks.max.memory CSV fields returns exit status 2 on RTX PRO 6000 Blackwell (driver 98.x). This caused the entire GPU inventory query to fail and the clock lock to be skipped, leaving normalization as "partial".

Fix:
- Add a minimal fallback query (index,uuid,name,pci.bus_id,vbios_version,power.limit) that succeeds even without the clock fields.
- Add enrichGPUInfoWithMaxClocks, which parses the "Max Clocks" section of the verbose nvidia-smi -q output to fill MaxGraphicsClockMHz / MaxMemoryClockMHz when the CSV fields fail.
- Move the nvidia-smi -q execution before queryBenchmarkGPUInfo so that its output is available for clock enrichment immediately afterwards.
- Tests: cover the enrichment and skip-if-populated cases.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -121,15 +121,22 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
var serverIdleOK, serverLoadedOK bool
|
var serverIdleOK, serverLoadedOK bool
|
||||||
var serverLoadedSamples int
|
var serverLoadedSamples int
|
||||||
|
|
||||||
|
// Run nvidia-smi -q first: used both for the log file and as a fallback
|
||||||
|
// source of max clock values when CSV clock fields are unsupported.
|
||||||
|
var nvsmiQOut []byte
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
nvsmiQOut = out
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
|
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
|
// Enrich with max clocks from verbose output — covers GPUs where
|
||||||
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
|
||||||
}
|
|
||||||
|
|
||||||
activeApps, err := queryActiveComputeApps(selected)
|
activeApps, err := queryActiveComputeApps(selected)
|
||||||
if err == nil && len(activeApps) > 0 {
|
if err == nil && len(activeApps) > 0 {
|
||||||
@@ -370,9 +377,13 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
// Fields are tried in order; the first successful query wins. Extended fields
|
// Fields are tried in order; the first successful query wins. Extended fields
|
||||||
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
// all driver versions, so we fall back to the base set if the full query fails.
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
|
// The minimal fallback omits clock fields entirely — clocks.max.* returns
|
||||||
|
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
|
||||||
|
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
|
||||||
var benchmarkGPUInfoQueries = []struct {
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
fields string
|
fields string
|
||||||
extended bool // whether this query includes optional extended fields
|
extended bool // whether this query includes optional extended fields
|
||||||
|
minimal bool // clock fields omitted; max clocks must be filled separately
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||||
@@ -382,6 +393,83 @@ var benchmarkGPUInfoQueries = []struct {
|
|||||||
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
|
||||||
extended: false,
|
extended: false,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit",
|
||||||
|
minimal: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
|
||||||
|
// any GPU in infoByIndex where those values are still zero. It parses the
|
||||||
|
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
|
||||||
|
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
|
||||||
|
// return exit status 2 but the verbose query works fine.
|
||||||
|
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
|
||||||
|
if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build bus_id → index map for matching verbose sections to GPU indices.
|
||||||
|
busToBenchIdx := make(map[string]int, len(infoByIndex))
|
||||||
|
for idx, info := range infoByIndex {
|
||||||
|
if info.BusID != "" {
|
||||||
|
// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
|
||||||
|
// while --query-gpu returns the same format; normalise to lower.
|
||||||
|
busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split the verbose output into per-GPU sections on "^GPU " lines.
|
||||||
|
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
|
||||||
|
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
|
||||||
|
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
|
||||||
|
|
||||||
|
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
|
||||||
|
for i, loc := range sectionStarts {
|
||||||
|
busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
|
||||||
|
benchIdx, ok := busToBenchIdx[busID]
|
||||||
|
if !ok {
|
||||||
|
// Bus IDs from verbose output may have a different domain prefix;
|
||||||
|
// try suffix match on the slot portion (XX:XX.X).
|
||||||
|
for k, v := range busToBenchIdx {
|
||||||
|
if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
|
||||||
|
benchIdx = v
|
||||||
|
ok = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
info := infoByIndex[benchIdx]
|
||||||
|
if info.MaxGraphicsClockMHz > 0 && info.MaxMemoryClockMHz > 0 {
|
||||||
|
continue // already populated
|
||||||
|
}
|
||||||
|
|
||||||
|
end := len(nvsmiQ)
|
||||||
|
if i+1 < len(sectionStarts) {
|
||||||
|
end = sectionStarts[i+1][0]
|
||||||
|
}
|
||||||
|
section := nvsmiQ[loc[0]:end]
|
||||||
|
|
||||||
|
if info.MaxGraphicsClockMHz == 0 {
|
||||||
|
if m := maxGfxRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
|
||||||
|
info.MaxGraphicsClockMHz = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if info.MaxMemoryClockMHz == 0 {
|
||||||
|
if m := maxMemRe.FindSubmatch(section); m != nil {
|
||||||
|
if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
|
||||||
|
info.MaxMemoryClockMHz = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
infoByIndex[benchIdx] = info
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||||
@@ -409,9 +497,13 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
minFields := 6
|
||||||
|
if !q.minimal {
|
||||||
|
minFields = 9
|
||||||
|
}
|
||||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||||
for _, row := range rows {
|
for _, row := range rows {
|
||||||
if len(row) < 9 {
|
if len(row) < minFields {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||||
@@ -419,24 +511,26 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
info := benchmarkGPUInfo{
|
info := benchmarkGPUInfo{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
UUID: strings.TrimSpace(row[1]),
|
UUID: strings.TrimSpace(row[1]),
|
||||||
Name: strings.TrimSpace(row[2]),
|
Name: strings.TrimSpace(row[2]),
|
||||||
BusID: strings.TrimSpace(row[3]),
|
BusID: strings.TrimSpace(row[3]),
|
||||||
VBIOS: strings.TrimSpace(row[4]),
|
VBIOS: strings.TrimSpace(row[4]),
|
||||||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
|
||||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
|
||||||
}
|
}
|
||||||
if len(row) >= 9 {
|
if !q.minimal {
|
||||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
|
||||||
}
|
info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
|
||||||
if q.extended {
|
if len(row) >= 9 {
|
||||||
if len(row) >= 10 {
|
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
|
||||||
}
|
}
|
||||||
if len(row) >= 11 {
|
if q.extended {
|
||||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
if len(row) >= 10 {
|
||||||
|
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||||
|
}
|
||||||
|
if len(row) >= 11 {
|
||||||
|
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
infoByIndex[idx] = info
|
infoByIndex[idx] = info
|
||||||
|
|||||||
@@ -178,3 +178,67 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
|||||||
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Clocks
|
||||||
|
Graphics : 2422 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
SM : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
Video : 2107 MHz
|
||||||
|
|
||||||
|
GPU 00000000:4F:00.0
|
||||||
|
Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 2430 MHz
|
||||||
|
Memory : 12481 MHz
|
||||||
|
`)
|
||||||
|
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0"},
|
||||||
|
1: {Index: 1, BusID: "00000000:4F:00.0"},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[0].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
if infoByIndex[1].MaxMemoryClockMHz != 12481 {
|
||||||
|
t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
nvsmiQ := []byte(`
|
||||||
|
GPU 00000000:4E:00.0
|
||||||
|
Max Clocks
|
||||||
|
Graphics : 9999 MHz
|
||||||
|
Memory : 9999 MHz
|
||||||
|
`)
|
||||||
|
// Already populated — must not be overwritten.
|
||||||
|
infoByIndex := map[int]benchmarkGPUInfo{
|
||||||
|
0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
|
||||||
|
}
|
||||||
|
|
||||||
|
enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
|
||||||
|
|
||||||
|
if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
|
||||||
|
t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user