diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 059b7c1..b0a426f 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -121,15 +121,22 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv var serverIdleOK, serverLoadedOK bool var serverLoadedSamples int + // Run nvidia-smi -q first: used both for the log file and as a fallback + // source of max clock values when CSV clock fields are unsupported. + var nvsmiQOut []byte + if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { + nvsmiQOut = out + _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) + } + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) if infoErr != nil { result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) result.Normalization.Status = "partial" } - - if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { - _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) - } + // Enrich with max clocks from verbose output — covers GPUs where + // clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x). + enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut) activeApps, err := queryActiveComputeApps(selected) if err == nil && len(activeApps) > 0 { @@ -370,9 +377,13 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { // Fields are tried in order; the first successful query wins. Extended fields // (attribute.multiprocessor_count, power.default_limit) are not supported on // all driver versions, so we fall back to the base set if the full query fails. +// The minimal fallback omits clock fields entirely — clocks.max.* returns +// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are +// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks. var benchmarkGPUInfoQueries = []struct { fields string extended bool // whether this query includes optional extended fields + minimal bool // clock fields omitted; max clocks must be filled separately }{ { fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", @@ -382,6 +393,83 @@ var benchmarkGPUInfoQueries = []struct { fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics", extended: false, }, + { + fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit", + minimal: true, + }, +} + +// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for +// any GPU in infoByIndex where those values are still zero. It parses the +// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ). +// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields +// return exit status 2 but the verbose query works fine. +func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) { + if len(infoByIndex) == 0 || len(nvsmiQ) == 0 { + return + } + + // Build bus_id → index map for matching verbose sections to GPU indices. + busToBenchIdx := make(map[string]int, len(infoByIndex)) + for idx, info := range infoByIndex { + if info.BusID != "" { + // nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain), + // while --query-gpu returns the same format; normalise to lower. + busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx + } + } + + // Split the verbose output into per-GPU sections on "^GPU " lines. + gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) + maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) + maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) + + sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) + for i, loc := range sectionStarts { + busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]])) + benchIdx, ok := busToBenchIdx[busID] + if !ok { + // Bus IDs from verbose output may have a different domain prefix; + // try suffix match on the slot portion (XX:XX.X). + for k, v := range busToBenchIdx { + if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) { + benchIdx = v + ok = true + break + } + } + } + if !ok { + continue + } + + info := infoByIndex[benchIdx] + if info.MaxGraphicsClockMHz > 0 && info.MaxMemoryClockMHz > 0 { + continue // already populated + } + + end := len(nvsmiQ) + if i+1 < len(sectionStarts) { + end = sectionStarts[i+1][0] + } + section := nvsmiQ[loc[0]:end] + + if info.MaxGraphicsClockMHz == 0 { + if m := maxGfxRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil { + info.MaxGraphicsClockMHz = v + } + } + } + if info.MaxMemoryClockMHz == 0 { + if m := maxMemRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil { + info.MaxMemoryClockMHz = v + } + } + } + infoByIndex[benchIdx] = info + } } func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { @@ -409,9 +497,13 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { continue } + minFields := 6 + if !q.minimal { + minFields = 9 + } infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) for _, row := range rows { - if len(row) < 9 { + if len(row) < minFields { continue } idx, err := strconv.Atoi(strings.TrimSpace(row[0])) @@ -419,24 +511,26 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { continue } info := benchmarkGPUInfo{ - Index: idx, - UUID: strings.TrimSpace(row[1]), - Name: strings.TrimSpace(row[2]), - BusID: strings.TrimSpace(row[3]), - VBIOS: strings.TrimSpace(row[4]), - PowerLimitW: parseBenchmarkFloat(row[5]), - MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), - MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), + Index: idx, + UUID: strings.TrimSpace(row[1]), + Name: strings.TrimSpace(row[2]), + BusID: strings.TrimSpace(row[3]), + VBIOS: strings.TrimSpace(row[4]), + PowerLimitW: parseBenchmarkFloat(row[5]), } - if len(row) >= 9 { - info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) - } - if q.extended { - if len(row) >= 10 { - info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) + if !q.minimal { + info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6]) + info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7]) + if len(row) >= 9 { + info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) } - if len(row) >= 11 { - info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) + if q.extended { + if len(row) >= 10 { + info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) + } + if len(row) >= 11 { + info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) + } } } infoByIndex[idx] = info diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 12463c7..b8cb5f4 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -178,3 +178,67 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) { t.Fatalf("report should not contain ANSI escapes\n%s", report) } } + +func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { + t.Parallel() + + nvsmiQ := []byte(` +GPU 00000000:4E:00.0 + Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition + Clocks + Graphics : 2422 MHz + Memory : 12481 MHz + Max Clocks + Graphics : 2430 MHz + SM : 2430 MHz + Memory : 12481 MHz + Video : 2107 MHz + +GPU 00000000:4F:00.0 + Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition + Max Clocks + Graphics : 2430 MHz + Memory : 12481 MHz +`) + + infoByIndex := map[int]benchmarkGPUInfo{ + 0: {Index: 0, BusID: "00000000:4E:00.0"}, + 1: {Index: 1, BusID: "00000000:4F:00.0"}, + } + + enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) + + if infoByIndex[0].MaxGraphicsClockMHz != 2430 { + t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz) + } + if infoByIndex[0].MaxMemoryClockMHz != 12481 { + t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz) + } + if infoByIndex[1].MaxGraphicsClockMHz != 2430 { + t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz) + } + if infoByIndex[1].MaxMemoryClockMHz != 12481 { + t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz) + } +} + +func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) { + t.Parallel() + + nvsmiQ := []byte(` +GPU 00000000:4E:00.0 + Max Clocks + Graphics : 9999 MHz + Memory : 9999 MHz +`) + // Already populated — must not be overwritten. + infoByIndex := map[int]benchmarkGPUInfo{ + 0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481}, + } + + enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) + + if infoByIndex[0].MaxGraphicsClockMHz != 2430 { + t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz) + } +}