diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 6984726..61bb493 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -37,6 +37,8 @@ type benchmarkGPUInfo struct { VBIOS string PowerLimitW float64 DefaultPowerLimitW float64 + MinPowerLimitW float64 + MaxPowerLimitW float64 MaxGraphicsClockMHz float64 MaxMemoryClockMHz float64 BaseGraphicsClockMHz float64 @@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) result.Normalization.Status = "partial" } - // Enrich with max clocks from verbose output — covers GPUs where - // clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x). - enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut) + // Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields + // are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x). + enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut) activeApps, err := queryActiveComputeApps(selected) if err == nil && len(activeApps) > 0 { @@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { // (attribute.multiprocessor_count, power.default_limit) are not supported on // all driver versions, so we fall back to the base set if the full query fails. // The minimal fallback omits clock fields entirely — clocks.max.* returns -// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are -// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks. +// exit status 2 on some GPU generations (e.g. Blackwell); missing data is +// then recovered from nvidia-smi -q. var benchmarkGPUInfoQueries = []struct { fields string extended bool // whether this query includes optional extended fields @@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct { }, } -// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for -// any GPU in infoByIndex where those values are still zero. It parses the -// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ). -// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields -// return exit status 2 but the verbose query works fine. -func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) { +// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q +// for fields that may be missing from --query-gpu on some driver versions. +func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) { if len(infoByIndex) == 0 || len(nvsmiQ) == 0 { return } @@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) + minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`) + maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`) slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`) @@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b } } } + if info.MinPowerLimitW == 0 { + if m := minPwrRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.MinPowerLimitW = v + } + } + } + if info.MaxPowerLimitW == 0 { + if m := maxPwrRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.MaxPowerLimitW = v + } + } + } if info.MultiprocessorCount == 0 { if m := smCountRe.FindSubmatch(section); m != nil { if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 { @@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration( if calibDurationSec <= 0 { calibDurationSec = 120 } - const maxDerateW = 150 // calibSearchTolerance is the binary-search convergence threshold in watts. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used. const calibSearchTolerance = 10 @@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration( originalLimitW int appliedLimitW int minLimitW int - lo int // highest verified-stable limit (assumed: minLimitW) + lo int // highest verified-stable limit hi int // lowest verified-unstable limit (exclusive sentinel above start) + loVerified bool calib benchmarkPowerCalibrationResult converged bool } @@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration( if appliedLimitW <= 0 { appliedLimitW = defaultLimitW } - minLimitW := appliedLimitW - switch { - case defaultLimitW > 0: - minLimitW = defaultLimitW - maxDerateW - floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70)) - if minLimitW < floorByRatio { - minLimitW = floorByRatio - } - case appliedLimitW > 0: - minLimitW = appliedLimitW - maxDerateW + minLimitW := int(math.Round(info.MinPowerLimitW)) + if minLimitW <= 0 { + minLimitW = appliedLimitW } - if minLimitW < calibSearchTolerance { - minLimitW = calibSearchTolerance + maxLimitW := int(math.Round(info.MaxPowerLimitW)) + if maxLimitW > 0 && appliedLimitW > maxLimitW { + appliedLimitW = maxLimitW } s := &gpuCalibState{ idx: idx, @@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration( hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, } + if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW { + s.appliedLimitW = minLimitW + s.hi = minLimitW + 1 + } + if info.MinPowerLimitW <= 0 { + s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit") + } if seedLimits != nil { if seedW, ok := seedLimits[idx]; ok && seedW > 0 { // A previously validated limit is only a starting point. Re-run // targeted_power under the current multi-GPU thermal load and derate // again if this step shows new throttling. + if seedW < s.minLimitW { + seedW = s.minLimitW + } + if maxLimitW > 0 && seedW > maxLimitW { + seedW = maxLimitW + } if canDerate { _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW) } @@ -3331,6 +3353,7 @@ calibDone: s.calib.AppliedPowerLimitW = float64(s.appliedLimitW) logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) s.lo = s.appliedLimitW + s.loVerified = true if canDerate && s.hi-s.lo > calibSearchTolerance { next := roundTo5W((s.lo + s.hi) / 2) if next > s.lo && next < s.hi { @@ -3369,7 +3392,23 @@ calibDone: s.hi = s.appliedLimitW if s.hi-s.lo <= calibSearchTolerance { - if s.lo > s.minLimitW { + if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW { + if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil { + s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error()) + logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err)) + s.converged = true + continue + } + s.appliedLimitW = s.minLimitW + s.calib.AppliedPowerLimitW = float64(s.minLimitW) + s.calib.Derated = s.minLimitW < s.originalLimitW + s.info.PowerLimitW = float64(s.minLimitW) + infoByIndex[s.idx] = s.info + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW)) + continue + } + if s.loVerified { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi)) if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil { s.appliedLimitW = s.lo @@ -3381,7 +3420,8 @@ calibDone: s.calib.Completed = true } } else { - s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW)) } s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx) s.converged = true @@ -3396,9 +3436,7 @@ calibDone: next = (s.lo + s.hi) / 2 } if next < s.minLimitW { - s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) - s.converged = true - continue + next = s.minLimitW } if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil { s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error()) diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 9b2c5da..19671d9 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -356,12 +356,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) { } } -func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { +func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) { t.Parallel() nvsmiQ := []byte(` GPU 00000000:4E:00.0 Product Name : NVIDIA RTX PRO 6000 Blackwell Server Edition + Min Power Limit : 200.00 W + Max Power Limit : 600.00 W + Default Power Limit : 575.00 W + Current Power Limit : 560.00 W Clocks Graphics : 2422 MHz Memory : 12481 MHz @@ -383,7 +387,7 @@ GPU 00000000:4F:00.0 1: {Index: 1, BusID: "00000000:4F:00.0"}, } - enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) + enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ) if infoByIndex[0].MaxGraphicsClockMHz != 2430 { t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz) @@ -397,25 +401,49 @@ GPU 00000000:4F:00.0 if infoByIndex[1].MaxMemoryClockMHz != 12481 { t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz) } + if infoByIndex[0].MinPowerLimitW != 200 { + t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW) + } + if infoByIndex[0].MaxPowerLimitW != 600 { + t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW) + } + if infoByIndex[0].DefaultPowerLimitW != 575 { + t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW) + } + if infoByIndex[0].PowerLimitW != 560 { + t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW) + } } -func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) { +func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) { t.Parallel() nvsmiQ := []byte(` GPU 00000000:4E:00.0 + Min Power Limit : 100.00 W + Max Power Limit : 900.00 W Max Clocks Graphics : 9999 MHz Memory : 9999 MHz `) // Already populated — must not be overwritten. infoByIndex := map[int]benchmarkGPUInfo{ - 0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481}, + 0: { + Index: 0, + BusID: "00000000:4E:00.0", + MaxGraphicsClockMHz: 2430, + MaxMemoryClockMHz: 12481, + MinPowerLimitW: 200, + MaxPowerLimitW: 600, + }, } - enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ) + enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ) if infoByIndex[0].MaxGraphicsClockMHz != 2430 { t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz) } + if infoByIndex[0].MinPowerLimitW != 200 { + t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW) + } }