Use real NVIDIA power-limit bounds in benchmark

2026-04-20 09:26:56 +03:00
parent a94e8007f8
commit ab802719f8
2 changed files with 101 additions and 35 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
 	VBIOS                string
 	PowerLimitW          float64
 	DefaultPowerLimitW   float64
+	MinPowerLimitW       float64
+	MaxPowerLimitW       float64
 	MaxGraphicsClockMHz  float64
 	MaxMemoryClockMHz    float64
 	BaseGraphicsClockMHz float64
@@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-	// Enrich with max clocks from verbose output — covers GPUs where
+	// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
-	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
+	// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)
 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
 // The minimal fallback omits clock fields entirely — clocks.max.* returns
-// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
+// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
-// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
+// then recovered from nvidia-smi -q.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
@@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct {
 	},
 }
-// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
+// enrichGPUInfoWithNvidiaSMIQ fills benchmark GPU metadata from nvidia-smi -q
-// any GPU in infoByIndex where those values are still zero.  It parses the
+// for fields that may be missing from --query-gpu on some driver versions.
-// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
+func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
-// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
-// return exit status 2 but the verbose query works fine.
-func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
 		return
 	}
@@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
 	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
 	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
+	minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
+	maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
 	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
 	shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
 	slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 				}
 			}
 		}
+		if info.MinPowerLimitW == 0 {
+			if m := minPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MinPowerLimitW = v
+				}
+			}
+		}
+		if info.MaxPowerLimitW == 0 {
+			if m := maxPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MaxPowerLimitW = v
+				}
+			}
+		}
 		if info.MultiprocessorCount == 0 {
 			if m := smCountRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration(
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
-	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
 	const calibSearchTolerance = 10
@@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration(
 		originalLimitW int
 		appliedLimitW  int
 		minLimitW      int
-		lo             int // highest verified-stable limit (assumed: minLimitW)
+		lo             int // highest verified-stable limit
 		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
+		loVerified     bool
 		calib          benchmarkPowerCalibrationResult
 		converged      bool
 	}
@@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration(
 		if appliedLimitW <= 0 {
 			appliedLimitW = defaultLimitW
 		}
-		minLimitW := appliedLimitW
+		minLimitW := int(math.Round(info.MinPowerLimitW))
-		switch {
+		if minLimitW <= 0 {
-		case defaultLimitW > 0:
+			minLimitW = appliedLimitW
-			minLimitW = defaultLimitW - maxDerateW
-			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
-			if minLimitW < floorByRatio {
-				minLimitW = floorByRatio
-			}
-		case appliedLimitW > 0:
-			minLimitW = appliedLimitW - maxDerateW
 		}
-		if minLimitW < calibSearchTolerance {
+		maxLimitW := int(math.Round(info.MaxPowerLimitW))
-			minLimitW = calibSearchTolerance
+		if maxLimitW > 0 && appliedLimitW > maxLimitW {
+			appliedLimitW = maxLimitW
 		}
 		s := &gpuCalibState{
 			idx:            idx,
@@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration(
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
+		if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
+			s.appliedLimitW = minLimitW
+			s.hi = minLimitW + 1
+		}
+		if info.MinPowerLimitW <= 0 {
+			s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
+		}
 		if seedLimits != nil {
 			if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
 				// A previously validated limit is only a starting point. Re-run
 				// targeted_power under the current multi-GPU thermal load and derate
 				// again if this step shows new throttling.
+				if seedW < s.minLimitW {
+					seedW = s.minLimitW
+				}
+				if maxLimitW > 0 && seedW > maxLimitW {
+					seedW = maxLimitW
+				}
 				if canDerate {
 					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
 				}
@@ -3331,6 +3353,7 @@ calibDone:
 				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				s.lo = s.appliedLimitW
+				s.loVerified = true
 				if canDerate && s.hi-s.lo > calibSearchTolerance {
 					next := roundTo5W((s.lo + s.hi) / 2)
 					if next > s.lo && next < s.hi {
@@ -3369,7 +3392,23 @@ calibDone:
 			s.hi = s.appliedLimitW
 			if s.hi-s.lo <= calibSearchTolerance {
-				if s.lo > s.minLimitW {
+				if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
+					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
+						s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
+						logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
+						s.converged = true
+						continue
+					}
+					s.appliedLimitW = s.minLimitW
+					s.calib.AppliedPowerLimitW = float64(s.minLimitW)
+					s.calib.Derated = s.minLimitW < s.originalLimitW
+					s.info.PowerLimitW = float64(s.minLimitW)
+					infoByIndex[s.idx] = s.info
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
+					continue
+				}
+				if s.loVerified {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
 						s.appliedLimitW = s.lo
@@ -3381,7 +3420,8 @@ calibDone:
 						s.calib.Completed = true
 					}
 				} else {
-					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
@@ -3396,9 +3436,7 @@ calibDone:
 				next = (s.lo + s.hi) / 2
 			}
 			if next < s.minLimitW {
-				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+				next = s.minLimitW
-				s.converged = true
-				continue
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
 				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -356,12 +356,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	}
 }
-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()
 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
    Min Power Limit                       : 200.00 W
    Max Power Limit                       : 600.00 W
    Default Power Limit                   : 575.00 W
    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -383,7 +387,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -397,25 +401,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
 	if infoByIndex[0].MinPowerLimitW != 200 {
 		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
 	}
 	if infoByIndex[0].MaxPowerLimitW != 600 {
 		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
 	}
 	if infoByIndex[0].DefaultPowerLimitW != 575 {
 		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
 	}
 	if infoByIndex[0].PowerLimitW != 560 {
 		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
 	}
 }
-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()
 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Min Power Limit                       : 100.00 W
    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)
 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
 	if infoByIndex[0].MinPowerLimitW != 200 {
 		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
 	}
 }